// File: 3rdparty/libwebp/src/dsp/dec_neon.c
// Copyright 2012 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// ARM NEON version of dsp functions and loop filtering.10//11// Authors: Somnath Banerjee ([email protected])12// Johann Koenig ([email protected])1314#include "src/dsp/dsp.h"1516#if defined(WEBP_USE_NEON)1718#include "src/dsp/neon.h"19#include "src/dec/vp8i_dec.h"2021//------------------------------------------------------------------------------22// NxM Loading functions2324#if !defined(WORK_AROUND_GCC)2526// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation27// (register alloc, probably). The variants somewhat mitigate the problem, but28// not quite. 
HFilter16i() remains problematic.29static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,30int stride) {31const uint8x8_t zero = vdup_n_u8(0);32uint8x8x4_t out;33INIT_VECTOR4(out, zero, zero, zero, zero);34out = vld4_lane_u8(src + 0 * stride, out, 0);35out = vld4_lane_u8(src + 1 * stride, out, 1);36out = vld4_lane_u8(src + 2 * stride, out, 2);37out = vld4_lane_u8(src + 3 * stride, out, 3);38out = vld4_lane_u8(src + 4 * stride, out, 4);39out = vld4_lane_u8(src + 5 * stride, out, 5);40out = vld4_lane_u8(src + 6 * stride, out, 6);41out = vld4_lane_u8(src + 7 * stride, out, 7);42return out;43}4445static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,46uint8x16_t* const p1,47uint8x16_t* const p0,48uint8x16_t* const q0,49uint8x16_t* const q1) {50// row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]51// row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]52const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);53const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);54*p1 = vcombine_u8(row0.val[0], row8.val[0]);55*p0 = vcombine_u8(row0.val[1], row8.val[1]);56*q0 = vcombine_u8(row0.val[2], row8.val[2]);57*q1 = vcombine_u8(row0.val[3], row8.val[3]);58}5960#else // WORK_AROUND_GCC6162#define LOADQ_LANE_32b(VALUE, LANE) do { \63(VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \64src += stride; \65} while (0)6667static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,68uint8x16_t* const p1,69uint8x16_t* const p0,70uint8x16_t* const q0,71uint8x16_t* const q1) {72const uint32x4_t zero = vdupq_n_u32(0);73uint32x4x4_t in;74INIT_VECTOR4(in, zero, zero, zero, zero);75src -= 2;76LOADQ_LANE_32b(in.val[0], 0);77LOADQ_LANE_32b(in.val[1], 0);78LOADQ_LANE_32b(in.val[2], 0);79LOADQ_LANE_32b(in.val[3], 0);80LOADQ_LANE_32b(in.val[0], 1);81LOADQ_LANE_32b(in.val[1], 1);82LOADQ_LANE_32b(in.val[2], 1);83LOADQ_LANE_32b(in.val[3], 1);84LOADQ_LANE_32b(in.val[0], 2);85LOADQ_LANE_32b(in.val[1], 2);86LOADQ_LANE_32b(in.val[2], 
2);87LOADQ_LANE_32b(in.val[3], 2);88LOADQ_LANE_32b(in.val[0], 3);89LOADQ_LANE_32b(in.val[1], 3);90LOADQ_LANE_32b(in.val[2], 3);91LOADQ_LANE_32b(in.val[3], 3);92// Transpose four 4x4 parts:93{94const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),95vreinterpretq_u8_u32(in.val[1]));96const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),97vreinterpretq_u8_u32(in.val[3]));98const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),99vreinterpretq_u16_u8(row23.val[0]));100const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),101vreinterpretq_u16_u8(row23.val[1]));102*p1 = vreinterpretq_u8_u16(row02.val[0]);103*p0 = vreinterpretq_u8_u16(row13.val[0]);104*q0 = vreinterpretq_u8_u16(row02.val[1]);105*q1 = vreinterpretq_u8_u16(row13.val[1]);106}107}108#undef LOADQ_LANE_32b109110#endif // !WORK_AROUND_GCC111112static WEBP_INLINE void Load8x16_NEON(113const uint8_t* const src, int stride,114uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,115uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,116uint8x16_t* const q2, uint8x16_t* const q3) {117Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);118Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);119}120121static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,122uint8x16_t* const p1,123uint8x16_t* const p0,124uint8x16_t* const q0,125uint8x16_t* const q1) {126*p1 = vld1q_u8(src - 2 * stride);127*p0 = vld1q_u8(src - 1 * stride);128*q0 = vld1q_u8(src + 0 * stride);129*q1 = vld1q_u8(src + 1 * stride);130}131132static WEBP_INLINE void Load16x8_NEON(133const uint8_t* const src, int stride,134uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,135uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,136uint8x16_t* const q2, uint8x16_t* const q3) {137Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);138Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);139}140141static WEBP_INLINE void 
Load8x8x2_NEON(142const uint8_t* const u, const uint8_t* const v, int stride,143uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,144uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,145uint8x16_t* const q2, uint8x16_t* const q3) {146// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination147// and the v-samples on the higher half.148*p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));149*p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));150*p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));151*p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));152*q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));153*q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));154*q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));155*q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));156}157158#if !defined(WORK_AROUND_GCC)159160#define LOAD_UV_8(ROW) \161vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))162163static WEBP_INLINE void Load8x8x2T_NEON(164const uint8_t* const u, const uint8_t* const v, int stride,165uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,166uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,167uint8x16_t* const q2, uint8x16_t* const q3) {168// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination169// and the v-samples on the higher half.170const uint8x16_t row0 = LOAD_UV_8(0);171const uint8x16_t row1 = LOAD_UV_8(1);172const uint8x16_t row2 = LOAD_UV_8(2);173const uint8x16_t row3 = LOAD_UV_8(3);174const uint8x16_t row4 = LOAD_UV_8(4);175const uint8x16_t row5 = LOAD_UV_8(5);176const uint8x16_t row6 = LOAD_UV_8(6);177const uint8x16_t row7 = LOAD_UV_8(7);178// Perform two side-by-side 8x8 transposes179// u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07180// u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 
v12 ...181// u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...182// u30 u31 u32 u33 u34 u35 u36 u37 | ...183// u40 u41 u42 u43 u44 u45 u46 u47 | ...184// u50 u51 u52 u53 u54 u55 u56 u57 | ...185// u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...186// u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...187const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...188// u01 u11 u03 u13 ...189const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...190// u21 u31 u23 u33 ...191const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...192const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...193const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),194vreinterpretq_u16_u8(row23.val[0]));195const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),196vreinterpretq_u16_u8(row23.val[1]));197const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),198vreinterpretq_u16_u8(row67.val[0]));199const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),200vreinterpretq_u16_u8(row67.val[1]));201const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),202vreinterpretq_u32_u16(row46.val[0]));203const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),204vreinterpretq_u32_u16(row46.val[1]));205const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),206vreinterpretq_u32_u16(row57.val[0]));207const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),208vreinterpretq_u32_u16(row57.val[1]));209*p3 = vreinterpretq_u8_u32(row04.val[0]);210*p2 = vreinterpretq_u8_u32(row15.val[0]);211*p1 = vreinterpretq_u8_u32(row26.val[0]);212*p0 = vreinterpretq_u8_u32(row37.val[0]);213*q0 = vreinterpretq_u8_u32(row04.val[1]);214*q1 = vreinterpretq_u8_u32(row15.val[1]);215*q2 = vreinterpretq_u8_u32(row26.val[1]);216*q3 = vreinterpretq_u8_u32(row37.val[1]);217}218#undef LOAD_UV_8219220#endif // !WORK_AROUND_GCC221222static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t 
v,223uint8_t* const dst, int stride) {224vst2_lane_u8(dst + 0 * stride, v, 0);225vst2_lane_u8(dst + 1 * stride, v, 1);226vst2_lane_u8(dst + 2 * stride, v, 2);227vst2_lane_u8(dst + 3 * stride, v, 3);228vst2_lane_u8(dst + 4 * stride, v, 4);229vst2_lane_u8(dst + 5 * stride, v, 5);230vst2_lane_u8(dst + 6 * stride, v, 6);231vst2_lane_u8(dst + 7 * stride, v, 7);232}233234static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,235uint8_t* const dst, int stride) {236uint8x8x2_t lo, hi;237lo.val[0] = vget_low_u8(p0);238lo.val[1] = vget_low_u8(q0);239hi.val[0] = vget_high_u8(p0);240hi.val[1] = vget_high_u8(q0);241Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);242Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);243}244245#if !defined(WORK_AROUND_GCC)246static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,247uint8_t* const dst, int stride) {248vst4_lane_u8(dst + 0 * stride, v, 0);249vst4_lane_u8(dst + 1 * stride, v, 1);250vst4_lane_u8(dst + 2 * stride, v, 2);251vst4_lane_u8(dst + 3 * stride, v, 3);252vst4_lane_u8(dst + 4 * stride, v, 4);253vst4_lane_u8(dst + 5 * stride, v, 5);254vst4_lane_u8(dst + 6 * stride, v, 6);255vst4_lane_u8(dst + 7 * stride, v, 7);256}257258static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,259const uint8x16_t q0, const uint8x16_t q1,260uint8_t* const dst, int stride) {261uint8x8x4_t lo, hi;262INIT_VECTOR4(lo,263vget_low_u8(p1), vget_low_u8(p0),264vget_low_u8(q0), vget_low_u8(q1));265INIT_VECTOR4(hi,266vget_high_u8(p1), vget_high_u8(p0),267vget_high_u8(q0), vget_high_u8(q1));268Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);269Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);270}271#endif // !WORK_AROUND_GCC272273static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,274uint8_t* const dst, int stride) {275vst1q_u8(dst - stride, p0);276vst1q_u8(dst, q0);277}278279static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,280const uint8x16_t q0, const 
uint8x16_t q1,281uint8_t* const dst, int stride) {282Store16x2_NEON(p1, p0, dst - stride, stride);283Store16x2_NEON(q0, q1, dst + stride, stride);284}285286static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,287const uint8x16_t q0,288uint8_t* const u, uint8_t* const v,289int stride) {290// p0 and q0 contain the u+v samples packed in low/high halves.291vst1_u8(u - stride, vget_low_u8(p0));292vst1_u8(u, vget_low_u8(q0));293vst1_u8(v - stride, vget_high_u8(p0));294vst1_u8(v, vget_high_u8(q0));295}296297static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,298const uint8x16_t p0,299const uint8x16_t q0,300const uint8x16_t q1,301uint8_t* const u, uint8_t* const v,302int stride) {303// The p1...q1 registers contain the u+v samples packed in low/high halves.304Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);305Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);306}307308#if !defined(WORK_AROUND_GCC)309310#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \311vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \312vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \313(DST) += stride; \314} while (0)315316static WEBP_INLINE void Store6x8x2_NEON(317const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,318const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,319uint8_t* u, uint8_t* v, int stride) {320uint8x8x3_t u0, u1, v0, v1;321INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));322INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));323INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));324INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));325STORE6_LANE(u, u0, u1, 0);326STORE6_LANE(u, u0, u1, 1);327STORE6_LANE(u, u0, u1, 2);328STORE6_LANE(u, u0, u1, 3);329STORE6_LANE(u, u0, u1, 4);330STORE6_LANE(u, u0, u1, 5);331STORE6_LANE(u, u0, u1, 6);332STORE6_LANE(u, u0, u1, 7);333STORE6_LANE(v, v0, v1, 0);334STORE6_LANE(v, v0, v1, 1);335STORE6_LANE(v, v0, v1, 2);336STORE6_LANE(v, v0, v1, 
3);337STORE6_LANE(v, v0, v1, 4);338STORE6_LANE(v, v0, v1, 5);339STORE6_LANE(v, v0, v1, 6);340STORE6_LANE(v, v0, v1, 7);341}342#undef STORE6_LANE343344static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,345const uint8x16_t p0,346const uint8x16_t q0,347const uint8x16_t q1,348uint8_t* const u, uint8_t* const v,349int stride) {350uint8x8x4_t u0, v0;351INIT_VECTOR4(u0,352vget_low_u8(p1), vget_low_u8(p0),353vget_low_u8(q0), vget_low_u8(q1));354INIT_VECTOR4(v0,355vget_high_u8(p1), vget_high_u8(p0),356vget_high_u8(q0), vget_high_u8(q1));357vst4_lane_u8(u - 2 + 0 * stride, u0, 0);358vst4_lane_u8(u - 2 + 1 * stride, u0, 1);359vst4_lane_u8(u - 2 + 2 * stride, u0, 2);360vst4_lane_u8(u - 2 + 3 * stride, u0, 3);361vst4_lane_u8(u - 2 + 4 * stride, u0, 4);362vst4_lane_u8(u - 2 + 5 * stride, u0, 5);363vst4_lane_u8(u - 2 + 6 * stride, u0, 6);364vst4_lane_u8(u - 2 + 7 * stride, u0, 7);365vst4_lane_u8(v - 2 + 0 * stride, v0, 0);366vst4_lane_u8(v - 2 + 1 * stride, v0, 1);367vst4_lane_u8(v - 2 + 2 * stride, v0, 2);368vst4_lane_u8(v - 2 + 3 * stride, v0, 3);369vst4_lane_u8(v - 2 + 4 * stride, v0, 4);370vst4_lane_u8(v - 2 + 5 * stride, v0, 5);371vst4_lane_u8(v - 2 + 6 * stride, v0, 6);372vst4_lane_u8(v - 2 + 7 * stride, v0, 7);373}374375#endif // !WORK_AROUND_GCC376377// Zero extend 'v' to an int16x8_t.378static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {379return vreinterpretq_s16_u16(vmovl_u8(v));380}381382// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result383// to the corresponding rows of 'dst'.384static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,385const int16x8_t dst01,386const int16x8_t dst23) {387// Unsigned saturate to 8b.388const uint8x8_t dst01_u8 = vqmovun_s16(dst01);389const uint8x8_t dst23_u8 = vqmovun_s16(dst23);390391// Store the results.392vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);393vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 
1);394vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);395vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);396}397398static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,399const int16x8_t row23,400uint8_t* const dst) {401uint32x2_t dst01 = vdup_n_u32(0);402uint32x2_t dst23 = vdup_n_u32(0);403404// Load the source pixels.405dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);406dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);407dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);408dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);409410{411// Convert to 16b.412const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));413const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));414415// Descale with rounding.416const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);417const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);418// Add the inverse transform.419SaturateAndStore4x4_NEON(dst, out01, out23);420}421}422423//-----------------------------------------------------------------------------424// Simple In-loop filtering (Paragraph 15.2)425426static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,427const uint8x16_t q0, const uint8x16_t q1,428int thresh) {429const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);430const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)431const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)432const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)433const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2434const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);435const uint8x16_t mask = vcgeq_u8(thresh_v, sum);436return mask;437}438439static int8x16_t FlipSign_NEON(const uint8x16_t v) {440const uint8x16_t sign_bit = vdupq_n_u8(0x80);441return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));442}443444static uint8x16_t 
FlipSignBack_NEON(const int8x16_t v) {445const int8x16_t sign_bit = vdupq_n_s8(0x80);446return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));447}448449static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,450const int8x16_t q0, const int8x16_t q1) {451const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)452const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)453const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)454const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)455const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)456return s3;457}458459static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {460const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)461const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)462const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)463return s2;464}465466//------------------------------------------------------------------------------467468static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,469const int8x16_t delta,470int8x16_t* const op0,471int8x16_t* const oq0) {472const int8x16_t kCst3 = vdupq_n_s8(0x03);473const int8x16_t kCst4 = vdupq_n_s8(0x04);474const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);475const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);476const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);477const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);478*op0 = vqaddq_s8(p0s, delta3);479*oq0 = vqsubq_s8(q0s, delta4);480}481482#if defined(WEBP_USE_INTRINSICS)483484static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,485const int8x16_t delta,486uint8x16_t* const op0, uint8x16_t* const oq0) {487const int8x16_t kCst3 = vdupq_n_s8(0x03);488const int8x16_t kCst4 = vdupq_n_s8(0x04);489const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);490const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);491const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);492const int8x16_t delta4 = vshrq_n_s8(delta_p4, 
3);493const int8x16_t sp0 = vqaddq_s8(p0s, delta3);494const int8x16_t sq0 = vqsubq_s8(q0s, delta4);495*op0 = FlipSignBack_NEON(sp0);496*oq0 = FlipSignBack_NEON(sq0);497}498499static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,500const uint8x16_t q0, const uint8x16_t q1,501const uint8x16_t mask,502uint8x16_t* const op0, uint8x16_t* const oq0) {503const int8x16_t p1s = FlipSign_NEON(p1);504const int8x16_t p0s = FlipSign_NEON(p0);505const int8x16_t q0s = FlipSign_NEON(q0);506const int8x16_t q1s = FlipSign_NEON(q1);507const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);508const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));509ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);510}511512static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {513uint8x16_t p1, p0, q0, q1, op0, oq0;514Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);515{516const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);517DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);518}519Store16x2_NEON(op0, oq0, p, stride);520}521522static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {523uint8x16_t p1, p0, q0, q1, oq0, op0;524Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);525{526const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);527DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);528}529Store2x16_NEON(op0, oq0, p, stride);530}531532#else533534// Load/Store vertical edge535#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \536"vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \537"vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \538"vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \539"vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \540"vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \541"vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \542"vld4.8 {" #c1 "[6]," 
#c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \543"vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"544545#define STORE8x2(c1, c2, p, stride) \546"vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \547"vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \548"vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \549"vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \550"vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \551"vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \552"vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \553"vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"554555#define QRegs "q0", "q1", "q2", "q3", \556"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"557558#define FLIP_SIGN_BIT2(a, b, s) \559"veor " #a "," #a "," #s " \n" \560"veor " #b "," #b "," #s " \n" \561562#define FLIP_SIGN_BIT4(a, b, c, d, s) \563FLIP_SIGN_BIT2(a, b, s) \564FLIP_SIGN_BIT2(c, d, s) \565566#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \567"vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \568"vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \569"vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \570"vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \571"vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \572"vdup.8 q14, " #thresh " \n" \573"vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */574575#define GET_BASE_DELTA(p1, p0, q0, q1, o) \576"vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \577"vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \578"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \579"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \580"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */581582#define DO_SIMPLE_FILTER(p0, q0, fl) \583"vmov.i8 q15, #0x03 \n" \584"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \585"vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \586"vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 
+= filter1 */ \587\588"vmov.i8 q15, #0x04 \n" \589"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \590"vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \591"vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */592593// Applies filter on 2 pixels (p0 and q0)594#define DO_FILTER2(p1, p0, q0, q1, thresh) \595NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \596"vmov.i8 q10, #0x80 \n" /* sign bit */ \597FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \598GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \599"vand q9, q9, q11 \n" /* apply filter mask */ \600DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \601FLIP_SIGN_BIT2(p0, q0, q10)602603static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {604__asm__ volatile (605"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride606607"vld1.u8 {q1}, [%[p]], %[stride] \n" // p1608"vld1.u8 {q2}, [%[p]], %[stride] \n" // p0609"vld1.u8 {q3}, [%[p]], %[stride] \n" // q0610"vld1.u8 {q12}, [%[p]] \n" // q1611612DO_FILTER2(q1, q2, q3, q12, %[thresh])613614"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride615616"vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0617"vst1.u8 {q3}, [%[p]] \n" // store oq0618: [p] "+r"(p)619: [stride] "r"(stride), [thresh] "r"(thresh)620: "memory", QRegs621);622}623624static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {625__asm__ volatile (626"sub r4, %[p], #2 \n" // base1 = p - 2627"lsl r6, %[stride], #1 \n" // r6 = 2 * stride628"add r5, r4, %[stride] \n" // base2 = base1 + stride629630LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)631LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)632"vswp d3, d24 \n" // p1:q1 p0:q3633"vswp d5, d26 \n" // q0:q2 q1:q4634"vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4635636DO_FILTER2(q1, q2, q12, q13, %[thresh])637638"sub %[p], %[p], #1 \n" // p - 1639640"vswp d5, d24 \n"641STORE8x2(d4, d5, [%[p]], %[stride])642STORE8x2(d24, d25, [%[p]], %[stride])643644: [p] "+r"(p)645: [stride] "r"(stride), [thresh] 
"r"(thresh)646: "memory", "r4", "r5", "r6", QRegs647);648}649650#undef LOAD8x4651#undef STORE8x2652653#endif // WEBP_USE_INTRINSICS654655static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {656uint32_t k;657for (k = 3; k != 0; --k) {658p += 4 * stride;659SimpleVFilter16_NEON(p, stride, thresh);660}661}662663static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {664uint32_t k;665for (k = 3; k != 0; --k) {666p += 4;667SimpleHFilter16_NEON(p, stride, thresh);668}669}670671//------------------------------------------------------------------------------672// Complex In-loop filtering (Paragraph 15.3)673674static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,675const uint8x16_t q0, const uint8x16_t q1,676int hev_thresh) {677const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);678const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)679const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)680const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);681const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);682return mask;683}684685static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,686const uint8x16_t p1, const uint8x16_t p0,687const uint8x16_t q0, const uint8x16_t q1,688const uint8x16_t q2, const uint8x16_t q3,689int ithresh, int thresh) {690const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);691const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)692const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)693const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)694const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)695const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)696const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)697const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);698const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);699const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);700const uint8x16_t max12 = vmaxq_u8(max1, 
max2);701const uint8x16_t max123 = vmaxq_u8(max12, max3);702const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);703const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);704const uint8x16_t mask = vandq_u8(mask1, mask2);705return mask;706}707708// 4-points filter709710static void ApplyFilter4_NEON(711const int8x16_t p1, const int8x16_t p0,712const int8x16_t q0, const int8x16_t q1,713const int8x16_t delta0,714uint8x16_t* const op1, uint8x16_t* const op0,715uint8x16_t* const oq0, uint8x16_t* const oq1) {716const int8x16_t kCst3 = vdupq_n_s8(0x03);717const int8x16_t kCst4 = vdupq_n_s8(0x04);718const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);719const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);720const int8x16_t a1 = vshrq_n_s8(delta1, 3);721const int8x16_t a2 = vshrq_n_s8(delta2, 3);722const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1723*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)724*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)725*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)726*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)727}728729static void DoFilter4_NEON(730const uint8x16_t p1, const uint8x16_t p0,731const uint8x16_t q0, const uint8x16_t q1,732const uint8x16_t mask, const uint8x16_t hev_mask,733uint8x16_t* const op1, uint8x16_t* const op0,734uint8x16_t* const oq0, uint8x16_t* const oq1) {735// This is a fused version of DoFilter2() calling ApplyFilter2 directly736const int8x16_t p1s = FlipSign_NEON(p1);737int8x16_t p0s = FlipSign_NEON(p0);738int8x16_t q0s = FlipSign_NEON(q0);739const int8x16_t q1s = FlipSign_NEON(q1);740const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);741742// do_filter2 part (simple loopfilter on pixels with hev)743{744const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);745const int8x16_t simple_lf_delta =746vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));747ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);748}749750// 
  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

// 6-points filter

// Applies the complex (6-tap) loopfilter delta to the three pixels on each
// side of the edge and un-biases the result back to unsigned.
static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

// Complex loopfilter: simple (2-tap) filter where hev is set, 6-tap filter
// on the remaining pixels selected by 'mask'.
static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign_NEON(p2);
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t q2s = FlipSign_NEON(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                      op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16_NEON(op2, op1, p - 2, stride);
    Store2x16_NEON(op0, oq0, p + 0, stride);
    Store2x16_NEON(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // as in VFilter16i(), p3/p2 outputs are carried over to the next span
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This can not be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

// One 1-D pass of the 4x4 inverse DCT, operating on two rows at a time.
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
}

#else

static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[constants]]        \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp            d3, d4                      \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16        q8, q8, #1                  \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* See long winded explanations prior */
    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"

    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val) + 4 >> 3 */
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    /* Must accumulate before saturating */
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}

#endif    // WEBP_USE_INTRINSICS

static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (do_two) {
    TransformOne_NEON(in + 16, dst + 4);
  }
}

static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
  const int16x8_t DC = vdupq_n_s16(in[0]);
  Add4x4_NEON(DC, DC, dst);
}

//------------------------------------------------------------------------------

// Scatters one column of 'rows' to 'dst', one value every 16 coefficients
// (i.e. into the DC slot of each of the 16 sub-blocks).
#define STORE_WHT(dst, col, rows) do {                  \
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)

static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
  int32x4x4_t tmp;

  {
    // Load the source.
    const int16x4_t in00_03 = vld1_s16(in + 0);
    const int16x4_t in04_07 = vld1_s16(in + 4);
    const int16x4_t in08_11 = vld1_s16(in + 8);
    const int16x4_t in12_15 = vld1_s16(in + 12);
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);
    // Arrange the temporary results column-wise.
    tmp = Transpose4x4_NEON(tmp);
  }

  {
    const int32x4_t kCst3 = vdupq_n_s32(3);
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);

    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);

    // right shift the results by 3.
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);

    STORE_WHT(out, 0, tmp);
    STORE_WHT(out, 1, tmp);
    STORE_WHT(out, 2, tmp);
    STORE_WHT(out, 3, tmp);
  }
}

#undef STORE_WHT

//------------------------------------------------------------------------------

#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4_NEON(m0_m1, m2_m3, dst);
}
#undef MUL

//------------------------------------------------------------------------------
// 4x4

static void DC4_NEON(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
  const uint16x8_t s0 = vaddq_u16(L0, L1);
  const uint16x8_t s1 = vaddq_u16(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  // lane 0 now holds: sum of 4 left pixels + sum of 4 top pixels
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}

// TrueMotion (4x4 + 8x8)
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }

static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}

static void RD4_NEON(uint8_t* dst) {   // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

static void LD4_NEON(uint8_t* dst) {   // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  int j;
  for (j = 0; j < 8; ++j) {
    vst1_u8(dst + j * BPS, top);
  }
}

static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const uint8x8_t left = vld1_dup_u8(dst - 1);
    vst1_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
    const uint16x8_t s0 = vaddq_u16(L0, L1);
    const uint16x8_t s1 = vaddq_u16(L2, L3);
    const uint16x8_t s2 = vaddq_u16(L4, L5);
    const uint16x8_t s3 = vaddq_u16(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }

static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

static void VE16_NEON(uint8_t* dst) {     // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16_NEON(uint8_t* dst) {     // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
      const uint16x8_t s0 = vaddq_u16(L0, L1);
      const uint16x8_t s1 = vaddq_u16(L2, L3);
      const uint16x8_t s2 = vaddq_u16(L4, L5);
      const uint16x8_t s3 = vaddq_u16(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }

static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i_NEON;
#endif
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;

  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON