CoCalc -- dec

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/dsp/dec_neon.c
⁹⁹¹³ views
1
// Copyright 2012 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// ARM NEON version of dsp functions and loop filtering.
11
//
12
// Authors: Somnath Banerjee ([email protected])
13
//          Johann Koenig ([email protected])
14

15
#include "src/dsp/dsp.h"
16

17
#if defined(WEBP_USE_NEON)
18

19
#include "src/dsp/neon.h"
20
#include "src/dec/vp8i_dec.h"
21

22
//------------------------------------------------------------------------------
23
// NxM Loading functions
24

25
#if !defined(WORK_AROUND_GCC)
26

27
// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28
// (register alloc, probably). The variants somewhat mitigate the problem, but
29
// not quite. HFilter16i() remains problematic.
30
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31
                                            int stride) {
32
  const uint8x8_t zero = vdup_n_u8(0);
33
  uint8x8x4_t out;
34
  INIT_VECTOR4(out, zero, zero, zero, zero);
35
  out = vld4_lane_u8(src + 0 * stride, out, 0);
36
  out = vld4_lane_u8(src + 1 * stride, out, 1);
37
  out = vld4_lane_u8(src + 2 * stride, out, 2);
38
  out = vld4_lane_u8(src + 3 * stride, out, 3);
39
  out = vld4_lane_u8(src + 4 * stride, out, 4);
40
  out = vld4_lane_u8(src + 5 * stride, out, 5);
41
  out = vld4_lane_u8(src + 6 * stride, out, 6);
42
  out = vld4_lane_u8(src + 7 * stride, out, 7);
43
  return out;
44
}
45

46
static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47
                                      uint8x16_t* const p1,
48
                                      uint8x16_t* const p0,
49
                                      uint8x16_t* const q0,
50
                                      uint8x16_t* const q1) {
51
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53
  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54
  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
56
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
57
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
58
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
59
}
60

61
#else  // WORK_AROUND_GCC
62

63
#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
64
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
65
  src += stride;                                                     \
66
} while (0)
67

68
static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69
                                      uint8x16_t* const p1,
70
                                      uint8x16_t* const p0,
71
                                      uint8x16_t* const q0,
72
                                      uint8x16_t* const q1) {
73
  const uint32x4_t zero = vdupq_n_u32(0);
74
  uint32x4x4_t in;
75
  INIT_VECTOR4(in, zero, zero, zero, zero);
76
  src -= 2;
77
  LOADQ_LANE_32b(in.val[0], 0);
78
  LOADQ_LANE_32b(in.val[1], 0);
79
  LOADQ_LANE_32b(in.val[2], 0);
80
  LOADQ_LANE_32b(in.val[3], 0);
81
  LOADQ_LANE_32b(in.val[0], 1);
82
  LOADQ_LANE_32b(in.val[1], 1);
83
  LOADQ_LANE_32b(in.val[2], 1);
84
  LOADQ_LANE_32b(in.val[3], 1);
85
  LOADQ_LANE_32b(in.val[0], 2);
86
  LOADQ_LANE_32b(in.val[1], 2);
87
  LOADQ_LANE_32b(in.val[2], 2);
88
  LOADQ_LANE_32b(in.val[3], 2);
89
  LOADQ_LANE_32b(in.val[0], 3);
90
  LOADQ_LANE_32b(in.val[1], 3);
91
  LOADQ_LANE_32b(in.val[2], 3);
92
  LOADQ_LANE_32b(in.val[3], 3);
93
  // Transpose four 4x4 parts:
94
  {
95
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96
                                        vreinterpretq_u8_u32(in.val[1]));
97
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98
                                        vreinterpretq_u8_u32(in.val[3]));
99
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100
                                         vreinterpretq_u16_u8(row23.val[0]));
101
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102
                                         vreinterpretq_u16_u8(row23.val[1]));
103
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
104
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
105
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
106
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
107
  }
108
}
109
#undef LOADQ_LANE_32b
110

111
#endif  // !WORK_AROUND_GCC
112

113
static WEBP_INLINE void Load8x16_NEON(
114
    const uint8_t* const src, int stride,
115
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117
    uint8x16_t* const q2, uint8x16_t* const q3) {
118
  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119
  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120
}
121

122
static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123
                                      uint8x16_t* const p1,
124
                                      uint8x16_t* const p0,
125
                                      uint8x16_t* const q0,
126
                                      uint8x16_t* const q1) {
127
  *p1 = vld1q_u8(src - 2 * stride);
128
  *p0 = vld1q_u8(src - 1 * stride);
129
  *q0 = vld1q_u8(src + 0 * stride);
130
  *q1 = vld1q_u8(src + 1 * stride);
131
}
132

133
static WEBP_INLINE void Load16x8_NEON(
134
    const uint8_t* const src, int stride,
135
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137
    uint8x16_t* const q2, uint8x16_t* const q3) {
138
  Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
139
  Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
140
}
141

142
static WEBP_INLINE void Load8x8x2_NEON(
143
    const uint8_t* const u, const uint8_t* const v, int stride,
144
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146
    uint8x16_t* const q2, uint8x16_t* const q3) {
147
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148
  // and the v-samples on the higher half.
149
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157
}
158

159
#if !defined(WORK_AROUND_GCC)
160

161
#define LOAD_UV_8(ROW) \
162
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163

164
static WEBP_INLINE void Load8x8x2T_NEON(
165
    const uint8_t* const u, const uint8_t* const v, int stride,
166
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168
    uint8x16_t* const q2, uint8x16_t* const q3) {
169
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170
  // and the v-samples on the higher half.
171
  const uint8x16_t row0 = LOAD_UV_8(0);
172
  const uint8x16_t row1 = LOAD_UV_8(1);
173
  const uint8x16_t row2 = LOAD_UV_8(2);
174
  const uint8x16_t row3 = LOAD_UV_8(3);
175
  const uint8x16_t row4 = LOAD_UV_8(4);
176
  const uint8x16_t row5 = LOAD_UV_8(5);
177
  const uint8x16_t row6 = LOAD_UV_8(6);
178
  const uint8x16_t row7 = LOAD_UV_8(7);
179
  // Perform two side-by-side 8x8 transposes
180
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
181
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
182
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
183
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
184
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
185
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
186
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
187
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
188
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
189
                                                    // u01 u11 u03 u13 ...
190
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
191
                                                    // u21 u31 u23 u33 ...
192
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
193
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
194
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
195
                                       vreinterpretq_u16_u8(row23.val[0]));
196
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
197
                                       vreinterpretq_u16_u8(row23.val[1]));
198
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
199
                                       vreinterpretq_u16_u8(row67.val[0]));
200
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
201
                                       vreinterpretq_u16_u8(row67.val[1]));
202
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
203
                                       vreinterpretq_u32_u16(row46.val[0]));
204
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
205
                                       vreinterpretq_u32_u16(row46.val[1]));
206
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
207
                                       vreinterpretq_u32_u16(row57.val[0]));
208
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
209
                                       vreinterpretq_u32_u16(row57.val[1]));
210
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
211
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
212
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
213
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
214
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
215
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
216
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
217
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
218
}
219
#undef LOAD_UV_8
220

221
#endif  // !WORK_AROUND_GCC
222

223
static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224
                                      uint8_t* const dst, int stride) {
225
  vst2_lane_u8(dst + 0 * stride, v, 0);
226
  vst2_lane_u8(dst + 1 * stride, v, 1);
227
  vst2_lane_u8(dst + 2 * stride, v, 2);
228
  vst2_lane_u8(dst + 3 * stride, v, 3);
229
  vst2_lane_u8(dst + 4 * stride, v, 4);
230
  vst2_lane_u8(dst + 5 * stride, v, 5);
231
  vst2_lane_u8(dst + 6 * stride, v, 6);
232
  vst2_lane_u8(dst + 7 * stride, v, 7);
233
}
234

235
static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236
                                       uint8_t* const dst, int stride) {
237
  uint8x8x2_t lo, hi;
238
  lo.val[0] = vget_low_u8(p0);
239
  lo.val[1] = vget_low_u8(q0);
240
  hi.val[0] = vget_high_u8(p0);
241
  hi.val[1] = vget_high_u8(q0);
242
  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243
  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244
}
245

246
#if !defined(WORK_AROUND_GCC)
247
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248
                                      uint8_t* const dst, int stride) {
249
  vst4_lane_u8(dst + 0 * stride, v, 0);
250
  vst4_lane_u8(dst + 1 * stride, v, 1);
251
  vst4_lane_u8(dst + 2 * stride, v, 2);
252
  vst4_lane_u8(dst + 3 * stride, v, 3);
253
  vst4_lane_u8(dst + 4 * stride, v, 4);
254
  vst4_lane_u8(dst + 5 * stride, v, 5);
255
  vst4_lane_u8(dst + 6 * stride, v, 6);
256
  vst4_lane_u8(dst + 7 * stride, v, 7);
257
}
258

259
static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260
                                       const uint8x16_t q0, const uint8x16_t q1,
261
                                       uint8_t* const dst, int stride) {
262
  uint8x8x4_t lo, hi;
263
  INIT_VECTOR4(lo,
264
               vget_low_u8(p1), vget_low_u8(p0),
265
               vget_low_u8(q0), vget_low_u8(q1));
266
  INIT_VECTOR4(hi,
267
               vget_high_u8(p1), vget_high_u8(p0),
268
               vget_high_u8(q0), vget_high_u8(q1));
269
  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270
  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271
}
272
#endif  // !WORK_AROUND_GCC
273

274
static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275
                                       uint8_t* const dst, int stride) {
276
  vst1q_u8(dst - stride, p0);
277
  vst1q_u8(dst, q0);
278
}
279

280
static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281
                                       const uint8x16_t q0, const uint8x16_t q1,
282
                                       uint8_t* const dst, int stride) {
283
  Store16x2_NEON(p1, p0, dst - stride, stride);
284
  Store16x2_NEON(q0, q1, dst + stride, stride);
285
}
286

287
static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288
                                        const uint8x16_t q0,
289
                                        uint8_t* const u, uint8_t* const v,
290
                                        int stride) {
291
  // p0 and q0 contain the u+v samples packed in low/high halves.
292
  vst1_u8(u - stride, vget_low_u8(p0));
293
  vst1_u8(u,          vget_low_u8(q0));
294
  vst1_u8(v - stride, vget_high_u8(p0));
295
  vst1_u8(v,          vget_high_u8(q0));
296
}
297

298
static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299
                                        const uint8x16_t p0,
300
                                        const uint8x16_t q0,
301
                                        const uint8x16_t q1,
302
                                        uint8_t* const u, uint8_t* const v,
303
                                        int stride) {
304
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
305
  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306
  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307
}
308

309
#if !defined(WORK_AROUND_GCC)
310

311
#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
312
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
313
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
314
  (DST) += stride;                                \
315
} while (0)
316

317
static WEBP_INLINE void Store6x8x2_NEON(
318
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320
    uint8_t* u, uint8_t* v, int stride) {
321
  uint8x8x3_t u0, u1, v0, v1;
322
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326
  STORE6_LANE(u, u0, u1, 0);
327
  STORE6_LANE(u, u0, u1, 1);
328
  STORE6_LANE(u, u0, u1, 2);
329
  STORE6_LANE(u, u0, u1, 3);
330
  STORE6_LANE(u, u0, u1, 4);
331
  STORE6_LANE(u, u0, u1, 5);
332
  STORE6_LANE(u, u0, u1, 6);
333
  STORE6_LANE(u, u0, u1, 7);
334
  STORE6_LANE(v, v0, v1, 0);
335
  STORE6_LANE(v, v0, v1, 1);
336
  STORE6_LANE(v, v0, v1, 2);
337
  STORE6_LANE(v, v0, v1, 3);
338
  STORE6_LANE(v, v0, v1, 4);
339
  STORE6_LANE(v, v0, v1, 5);
340
  STORE6_LANE(v, v0, v1, 6);
341
  STORE6_LANE(v, v0, v1, 7);
342
}
343
#undef STORE6_LANE
344

345
static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346
                                        const uint8x16_t p0,
347
                                        const uint8x16_t q0,
348
                                        const uint8x16_t q1,
349
                                        uint8_t* const u, uint8_t* const v,
350
                                        int stride) {
351
  uint8x8x4_t u0, v0;
352
  INIT_VECTOR4(u0,
353
               vget_low_u8(p1), vget_low_u8(p0),
354
               vget_low_u8(q0), vget_low_u8(q1));
355
  INIT_VECTOR4(v0,
356
               vget_high_u8(p1), vget_high_u8(p0),
357
               vget_high_u8(q0), vget_high_u8(q1));
358
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374
}
375

376
#endif  // !WORK_AROUND_GCC
377

378
// Zero extend 'v' to an int16x8_t.
379
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380
  return vreinterpretq_s16_u16(vmovl_u8(v));
381
}
382

383
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384
// to the corresponding rows of 'dst'.
385
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386
                                                 const int16x8_t dst01,
387
                                                 const int16x8_t dst23) {
388
  // Unsigned saturate to 8b.
389
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391

392
  // Store the results.
393
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397
}
398

399
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400
                                    const int16x8_t row23,
401
                                    uint8_t* const dst) {
402
  uint32x2_t dst01 = vdup_n_u32(0);
403
  uint32x2_t dst23 = vdup_n_u32(0);
404

405
  // Load the source pixels.
406
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410

411
  {
412
    // Convert to 16b.
413
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415

416
    // Descale with rounding.
417
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419
    // Add the inverse transform.
420
    SaturateAndStore4x4_NEON(dst, out01, out23);
421
  }
422
}
423

424
//-----------------------------------------------------------------------------
425
// Simple In-loop filtering (Paragraph 15.2)
426

427
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
428
                                   const uint8x16_t q0, const uint8x16_t q1,
429
                                   int thresh) {
430
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
431
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
432
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
433
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
434
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
435
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
436
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
437
  return mask;
438
}
439

440
static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443
}
444

445
static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446
  const int8x16_t sign_bit = vdupq_n_s8(0x80);
447
  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448
}
449

450
static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451
                                   const int8x16_t q0, const int8x16_t q1) {
452
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
453
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
454
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
455
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
456
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
457
  return s3;
458
}
459

460
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
461
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
462
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
463
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
464
  return s2;
465
}
466

467
//------------------------------------------------------------------------------
468

469
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
470
                                    const int8x16_t delta,
471
                                    int8x16_t* const op0,
472
                                    int8x16_t* const oq0) {
473
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
474
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
475
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
476
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
477
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
478
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
479
  *op0 = vqaddq_s8(p0s, delta3);
480
  *oq0 = vqsubq_s8(q0s, delta4);
481
}
482

483
#if defined(WEBP_USE_INTRINSICS)
484

485
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
486
                              const int8x16_t delta,
487
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
488
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
489
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
490
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
491
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
492
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
493
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
494
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
495
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
496
  *op0 = FlipSignBack_NEON(sp0);
497
  *oq0 = FlipSignBack_NEON(sq0);
498
}
499

500
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
501
                           const uint8x16_t q0, const uint8x16_t q1,
502
                           const uint8x16_t mask,
503
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
504
  const int8x16_t p1s = FlipSign_NEON(p1);
505
  const int8x16_t p0s = FlipSign_NEON(p0);
506
  const int8x16_t q0s = FlipSign_NEON(q0);
507
  const int8x16_t q1s = FlipSign_NEON(q1);
508
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
509
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
510
  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
511
}
512

513
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514
  uint8x16_t p1, p0, q0, q1, op0, oq0;
515
  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516
  {
517
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519
  }
520
  Store16x2_NEON(op0, oq0, p, stride);
521
}
522

523
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524
  uint8x16_t p1, p0, q0, q1, oq0, op0;
525
  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526
  {
527
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529
  }
530
  Store2x16_NEON(op0, oq0, p, stride);
531
}
532

533
#else
534

535
// Load/Store vertical edge
536
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
537
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
538
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
539
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
540
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
541
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
542
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
543
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
544
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545

546
#define STORE8x2(c1, c2, p, stride)                                            \
547
  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
548
  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
549
  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
550
  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
551
  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
552
  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
553
  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
554
  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555

556
#define QRegs "q0", "q1", "q2", "q3",                                          \
557
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558

559
#define FLIP_SIGN_BIT2(a, b, s)                                                \
560
  "veor     " #a "," #a "," #s "               \n"                             \
561
  "veor     " #b "," #b "," #s "               \n"                             \
562

563
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
564
  FLIP_SIGN_BIT2(a, b, s)                                                      \
565
  FLIP_SIGN_BIT2(c, d, s)                                                      \
566

567
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
568
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
569
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
570
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
571
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
572
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
573
  "vdup.8     q14, " #thresh "            \n"                                  \
574
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
575

576
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
577
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
578
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
579
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
580
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
581
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */
582

583
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
584
  "vmov.i8    q15, #0x03                  \n"                                  \
585
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
586
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
587
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
588
                                                                               \
589
  "vmov.i8    q15, #0x04                  \n"                                  \
590
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
591
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
592
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
593

594
// Applies filter on 2 pixels (p0 and q0)
595
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
596
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
597
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
598
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
599
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
600
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
601
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
602
  FLIP_SIGN_BIT2(p0, q0, q10)
603

604
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
605
  __asm__ volatile (
606
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
607

608
    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
609
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
610
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
611
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
612

613
    DO_FILTER2(q1, q2, q3, q12, %[thresh])
614

615
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
616

617
    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
618
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
619
    : [p] "+r"(p)
620
    : [stride] "r"(stride), [thresh] "r"(thresh)
621
    : "memory", QRegs
622
  );
623
}
624

625
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
626
  __asm__ volatile (
627
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
628
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
629
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
630

631
    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
632
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
633
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
634
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
635
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
636

637
    DO_FILTER2(q1, q2, q12, q13, %[thresh])
638

639
    "sub        %[p], %[p], #1                 \n"  // p - 1
640

641
    "vswp        d5, d24                       \n"
642
    STORE8x2(d4, d5, [%[p]], %[stride])
643
    STORE8x2(d24, d25, [%[p]], %[stride])
644

645
    : [p] "+r"(p)
646
    : [stride] "r"(stride), [thresh] "r"(thresh)
647
    : "memory", "r4", "r5", "r6", QRegs
648
  );
649
}
650

651
#undef LOAD8x4
652
#undef STORE8x2
653

654
#endif    // WEBP_USE_INTRINSICS
655

656
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
657
  uint32_t k;
658
  for (k = 3; k != 0; --k) {
659
    p += 4 * stride;
660
    SimpleVFilter16_NEON(p, stride, thresh);
661
  }
662
}
663

664
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
665
  uint32_t k;
666
  for (k = 3; k != 0; --k) {
667
    p += 4;
668
    SimpleHFilter16_NEON(p, stride, thresh);
669
  }
670
}
671

672
//------------------------------------------------------------------------------
673
// Complex In-loop filtering (Paragraph 15.3)
674

675
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
676
                                const uint8x16_t q0, const uint8x16_t q1,
677
                                int hev_thresh) {
678
  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
679
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
680
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
681
  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
682
  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
683
  return mask;
684
}
685

686
static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687
                                    const uint8x16_t p1, const uint8x16_t p0,
688
                                    const uint8x16_t q0, const uint8x16_t q1,
689
                                    const uint8x16_t q2, const uint8x16_t q3,
690
                                    int ithresh, int thresh) {
691
  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692
  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
693
  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
694
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
695
  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
696
  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
697
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
698
  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699
  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700
  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701
  const uint8x16_t max12 = vmaxq_u8(max1, max2);
702
  const uint8x16_t max123 = vmaxq_u8(max12, max3);
703
  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704
  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705
  const uint8x16_t mask = vandq_u8(mask1, mask2);
706
  return mask;
707
}
708

709
//  4-points filter
710

711
static void ApplyFilter4_NEON(
712
    const int8x16_t p1, const int8x16_t p0,
713
    const int8x16_t q0, const int8x16_t q1,
714
    const int8x16_t delta0,
715
    uint8x16_t* const op1, uint8x16_t* const op0,
716
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
717
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
718
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
719
  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
720
  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
721
  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
722
  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
723
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
724
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
725
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
726
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
727
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
728
}
729

730
static void DoFilter4_NEON(
731
    const uint8x16_t p1, const uint8x16_t p0,
732
    const uint8x16_t q0, const uint8x16_t q1,
733
    const uint8x16_t mask, const uint8x16_t hev_mask,
734
    uint8x16_t* const op1, uint8x16_t* const op0,
735
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
736
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
737
  const int8x16_t p1s = FlipSign_NEON(p1);
738
  int8x16_t p0s = FlipSign_NEON(p0);
739
  int8x16_t q0s = FlipSign_NEON(q0);
740
  const int8x16_t q1s = FlipSign_NEON(q1);
741
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
742

743
  // do_filter2 part (simple loopfilter on pixels with hev)
744
  {
745
    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
746
    const int8x16_t simple_lf_delta =
747
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
748
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
749
  }
750

751
  // do_filter4 part (complex loopfilter on pixels without hev)
752
  {
753
    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
754
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
755
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
756
    const int8x16_t complex_lf_delta =
757
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
758
    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
759
  }
760
}
761

762
//  6-points filter
763

764
static void ApplyFilter6_NEON(
765
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
766
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
767
    const int8x16_t delta,
768
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
769
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
770
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
771
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
772
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
773
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
774
  const int8x8_t delta_lo = vget_low_s8(delta);
775
  const int8x8_t delta_hi = vget_high_s8(delta);
776
  const int8x8_t kCst9 = vdup_n_s8(9);
777
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
778
  const int8x8_t kCst18 = vdup_n_s8(18);
779
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
780
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
781
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
782
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
783
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
784
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
785
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
786
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
787
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
788
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
789
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
790
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
791
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
792

793
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
794
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - q1)
795
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
796
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
797
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
798
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
799
}
800

801
static void DoFilter6_NEON(
802
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
803
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
804
    const uint8x16_t mask, const uint8x16_t hev_mask,
805
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
806
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
807
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
808
  const int8x16_t p2s = FlipSign_NEON(p2);
809
  const int8x16_t p1s = FlipSign_NEON(p1);
810
  int8x16_t p0s = FlipSign_NEON(p0);
811
  int8x16_t q0s = FlipSign_NEON(q0);
812
  const int8x16_t q1s = FlipSign_NEON(q1);
813
  const int8x16_t q2s = FlipSign_NEON(q2);
814
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
815
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
816

817
  // do_filter2 part (simple loopfilter on pixels with hev)
818
  {
819
    const int8x16_t simple_lf_delta =
820
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
821
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
822
  }
823

824
  // do_filter6 part (complex loopfilter on pixels without hev)
825
  {
826
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
827
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
828
    const int8x16_t complex_lf_delta =
829
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
830
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
831
                      op2, op1, op0, oq0, oq1, oq2);
832
  }
833
}
834

835
// on macroblock edges
836

837
static void VFilter16_NEON(uint8_t* p, int stride,
838
                           int thresh, int ithresh, int hev_thresh) {
839
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841
  {
842
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843
                                              ithresh, thresh);
844
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
848
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851
  }
852
}
853

854
static void HFilter16_NEON(uint8_t* p, int stride,
855
                           int thresh, int ithresh, int hev_thresh) {
856
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858
  {
859
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860
                                              ithresh, thresh);
861
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
865
    Store2x16_NEON(op2, op1, p - 2, stride);
866
    Store2x16_NEON(op0, oq0, p + 0, stride);
867
    Store2x16_NEON(oq1, oq2, p + 2, stride);
868
  }
869
}
870

871
// on three inner edges
872
static void VFilter16i_NEON(uint8_t* p, int stride,
873
                            int thresh, int ithresh, int hev_thresh) {
874
  uint32_t k;
875
  uint8x16_t p3, p2, p1, p0;
876
  Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
877
  for (k = 3; k != 0; --k) {
878
    uint8x16_t q0, q1, q2, q3;
879
    p += 4 * stride;
880
    Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
881
    {
882
      const uint8x16_t mask =
883
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885
      // p3 and p2 are not just temporary variables here: they will be
886
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
887
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
889
      p1 = q2;
890
      p0 = q3;
891
    }
892
  }
893
}
894

895
#if !defined(WORK_AROUND_GCC)
896
static void HFilter16i_NEON(uint8_t* p, int stride,
897
                            int thresh, int ithresh, int hev_thresh) {
898
  uint32_t k;
899
  uint8x16_t p3, p2, p1, p0;
900
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901
  for (k = 3; k != 0; --k) {
902
    uint8x16_t q0, q1, q2, q3;
903
    p += 4;
904
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905
    {
906
      const uint8x16_t mask =
907
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
911
      p1 = q2;
912
      p0 = q3;
913
    }
914
  }
915
}
916
#endif  // !WORK_AROUND_GCC
917

918
// 8-pixels wide variant, for chroma filtering
919
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
920
                          int stride, int thresh, int ithresh, int hev_thresh) {
921
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923
  {
924
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925
                                              ithresh, thresh);
926
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
930
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
931
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
932
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
933
  }
934
}
935
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
936
                           int stride,
937
                           int thresh, int ithresh, int hev_thresh) {
938
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
939
  u += 4 * stride;
940
  v += 4 * stride;
941
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
942
  {
943
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
944
                                              ithresh, thresh);
945
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
946
    uint8x16_t op1, op0, oq0, oq1;
947
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
948
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
949
  }
950
}
951

952
#if !defined(WORK_AROUND_GCC)
953
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
954
                          int stride, int thresh, int ithresh, int hev_thresh) {
955
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
956
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
957
  {
958
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
959
                                              ithresh, thresh);
960
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
961
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
962
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
963
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
964
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
965
  }
966
}
967

968
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
969
                           int stride,
970
                           int thresh, int ithresh, int hev_thresh) {
971
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
972
  u += 4;
973
  v += 4;
974
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
975
  {
976
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
977
                                              ithresh, thresh);
978
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
979
    uint8x16_t op1, op0, oq0, oq1;
980
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
981
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
982
  }
983
}
984
#endif  // !WORK_AROUND_GCC
985

986
//-----------------------------------------------------------------------------
987
// Inverse transforms (Paragraph 14.4)
988

989
// Technically these are unsigned but vqdmulh is only available in signed.
990
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
991
// changing the >> 16 to >> 15 and requiring an additional >> 1.
992
// We use this to our advantage with kC2. The canonical value is 35468.
993
// However, the high bit is set so treating it as signed will give incorrect
994
// results. We avoid this by down shifting by 1 here to clear the highest bit.
995
// Combined with the doubling effect of vqdmulh we get >> 16.
996
// This can not be applied to kC1 because the lowest bit is set. Down shifting
997
// the constant would reduce precision.
998

999
// libwebp uses a trick to avoid some extra addition that libvpx does.
1000
// Instead of:
1001
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1002
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1003
// same issue with kC1 and vqdmulh that we work around by down shifting kC2
1004

1005
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
1006
static const int16_t kC2 =
1007
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
1008

1009
#if defined(WEBP_USE_INTRINSICS)
1010
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1011
                                          const int16x8_t in1,
1012
                                          int16x8x2_t* const out) {
1013
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
1014
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
1015
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
1016
                                                  // b0 d0 b1 d1 b2 d2 ...
1017
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1018
}
1019

1020
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1021
  // {rows} = in0 | in4
1022
  //          in8 | in12
1023
  // B1 = in4 | in12
1024
  const int16x8_t B1 =
1025
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1026
  // C0 = kC1 * in4 | kC1 * in12
1027
  // C1 = kC2 * in4 | kC2 * in12
1028
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1029
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1030
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1031
                                vget_low_s16(rows->val[1]));   // in0 + in8
1032
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1033
                                vget_low_s16(rows->val[1]));   // in0 - in8
1034
  // c = kC2 * in4 - kC1 * in12
1035
  // d = kC1 * in4 + kC2 * in12
1036
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1037
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1038
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
1039
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
1040
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
1041
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
1042
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1043
  Transpose8x2_NEON(E0, E1, rows);
1044
}
1045

1046
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
1047
                              uint8_t* WEBP_RESTRICT dst) {
1048
  int16x8x2_t rows;
1049
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1050
  TransformPass_NEON(&rows);
1051
  TransformPass_NEON(&rows);
1052
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
1053
}
1054

1055
#else
1056

1057
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
1058
                              uint8_t* WEBP_RESTRICT dst) {
1059
  const int kBPS = BPS;
1060
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
1061
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
1062
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1063
  __asm__ volatile (
1064
    "vld1.16         {q1, q2}, [%[in]]           \n"
1065
    "vld1.16         {d0}, [%[constants]]        \n"
1066

1067
    /* d2: in[0]
1068
     * d3: in[8]
1069
     * d4: in[4]
1070
     * d5: in[12]
1071
     */
1072
    "vswp            d3, d4                      \n"
1073

1074
    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1075
     * q9 = {in[4], in[12]} * kC2 >> 16
1076
     */
1077
    "vqdmulh.s16     q8, q2, d0[0]               \n"
1078
    "vqdmulh.s16     q9, q2, d0[1]               \n"
1079

1080
    /* d22 = a = in[0] + in[8]
1081
     * d23 = b = in[0] - in[8]
1082
     */
1083
    "vqadd.s16       d22, d2, d3                 \n"
1084
    "vqsub.s16       d23, d2, d3                 \n"
1085

1086
    /* The multiplication should be x * kC1 >> 16
1087
     * However, with vqdmulh we get x * kC1 * 2 >> 16
1088
     * (multiply, double, return high half)
1089
     * We avoided this in kC2 by pre-shifting the constant.
1090
     * q8 = in[4]/[12] * kC1 >> 16
1091
     */
1092
    "vshr.s16        q8, q8, #1                  \n"
1093

1094
    /* Add {in[4], in[12]} back after the multiplication. This is handled by
1095
     * adding 1 << 16 to kC1 in the libwebp C code.
1096
     */
1097
    "vqadd.s16       q8, q2, q8                  \n"
1098

1099
    /* d20 = c = in[4]*kC2 - in[12]*kC1
1100
     * d21 = d = in[4]*kC1 + in[12]*kC2
1101
     */
1102
    "vqsub.s16       d20, d18, d17               \n"
1103
    "vqadd.s16       d21, d19, d16               \n"
1104

1105
    /* d2 = tmp[0] = a + d
1106
     * d3 = tmp[1] = b + c
1107
     * d4 = tmp[2] = b - c
1108
     * d5 = tmp[3] = a - d
1109
     */
1110
    "vqadd.s16       d2, d22, d21                \n"
1111
    "vqadd.s16       d3, d23, d20                \n"
1112
    "vqsub.s16       d4, d23, d20                \n"
1113
    "vqsub.s16       d5, d22, d21                \n"
1114

1115
    "vzip.16         q1, q2                      \n"
1116
    "vzip.16         q1, q2                      \n"
1117

1118
    "vswp            d3, d4                      \n"
1119

1120
    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1121
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1122
     */
1123
    "vqdmulh.s16     q8, q2, d0[0]               \n"
1124
    "vqdmulh.s16     q9, q2, d0[1]               \n"
1125

1126
    /* d22 = a = tmp[0] + tmp[8]
1127
     * d23 = b = tmp[0] - tmp[8]
1128
     */
1129
    "vqadd.s16       d22, d2, d3                 \n"
1130
    "vqsub.s16       d23, d2, d3                 \n"
1131

1132
    /* See long winded explanations prior */
1133
    "vshr.s16        q8, q8, #1                  \n"
1134
    "vqadd.s16       q8, q2, q8                  \n"
1135

1136
    /* d20 = c = in[4]*kC2 - in[12]*kC1
1137
     * d21 = d = in[4]*kC1 + in[12]*kC2
1138
     */
1139
    "vqsub.s16       d20, d18, d17               \n"
1140
    "vqadd.s16       d21, d19, d16               \n"
1141

1142
    /* d2 = tmp[0] = a + d
1143
     * d3 = tmp[1] = b + c
1144
     * d4 = tmp[2] = b - c
1145
     * d5 = tmp[3] = a - d
1146
     */
1147
    "vqadd.s16       d2, d22, d21                \n"
1148
    "vqadd.s16       d3, d23, d20                \n"
1149
    "vqsub.s16       d4, d23, d20                \n"
1150
    "vqsub.s16       d5, d22, d21                \n"
1151

1152
    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1153
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1154
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1155
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1156

1157
    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1158

1159
    /* (val) + 4 >> 3 */
1160
    "vrshr.s16       d2, d2, #3                  \n"
1161
    "vrshr.s16       d3, d3, #3                  \n"
1162
    "vrshr.s16       d4, d4, #3                  \n"
1163
    "vrshr.s16       d5, d5, #3                  \n"
1164

1165
    "vzip.16         q1, q2                      \n"
1166
    "vzip.16         q1, q2                      \n"
1167

1168
    /* Must accumulate before saturating */
1169
    "vmovl.u8        q8, d6                      \n"
1170
    "vmovl.u8        q9, d7                      \n"
1171

1172
    "vqadd.s16       q1, q1, q8                  \n"
1173
    "vqadd.s16       q2, q2, q9                  \n"
1174

1175
    "vqmovun.s16     d0, q1                      \n"
1176
    "vqmovun.s16     d1, q2                      \n"
1177

1178
    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1179
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1180
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1181
    "vst1.32         d1[1], [%[dst]]             \n"
1182

1183
    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1184
    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
1185
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
1186
  );
1187
}
1188

1189
#endif    // WEBP_USE_INTRINSICS
1190

1191
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
1192
                              uint8_t* WEBP_RESTRICT dst, int do_two) {
1193
  TransformOne_NEON(in, dst);
1194
  if (do_two) {
1195
    TransformOne_NEON(in + 16, dst + 4);
1196
  }
1197
}
1198

1199
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
1200
                             uint8_t* WEBP_RESTRICT dst) {
1201
  const int16x8_t DC = vdupq_n_s16(in[0]);
1202
  Add4x4_NEON(DC, DC, dst);
1203
}
1204

1205
//------------------------------------------------------------------------------
1206

1207
#define STORE_WHT(dst, col, rows) do {                  \
1208
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1209
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1210
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1211
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1212
} while (0)
1213

1214
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
1215
                              int16_t* WEBP_RESTRICT out) {
1216
  int32x4x4_t tmp;
1217

1218
  {
1219
    // Load the source.
1220
    const int16x4_t in00_03 = vld1_s16(in + 0);
1221
    const int16x4_t in04_07 = vld1_s16(in + 4);
1222
    const int16x4_t in08_11 = vld1_s16(in + 8);
1223
    const int16x4_t in12_15 = vld1_s16(in + 12);
1224
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
1225
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
1226
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
1227
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
1228
    tmp.val[0] = vaddq_s32(a0, a1);
1229
    tmp.val[1] = vaddq_s32(a3, a2);
1230
    tmp.val[2] = vsubq_s32(a0, a1);
1231
    tmp.val[3] = vsubq_s32(a3, a2);
1232
    // Arrange the temporary results column-wise.
1233
    tmp = Transpose4x4_NEON(tmp);
1234
  }
1235

1236
  {
1237
    const int32x4_t kCst3 = vdupq_n_s32(3);
1238
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
1239
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1240
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1241
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1242
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1243

1244
    tmp.val[0] = vaddq_s32(a0, a1);
1245
    tmp.val[1] = vaddq_s32(a3, a2);
1246
    tmp.val[2] = vsubq_s32(a0, a1);
1247
    tmp.val[3] = vsubq_s32(a3, a2);
1248

1249
    // right shift the results by 3.
1250
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1251
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1252
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1253
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1254

1255
    STORE_WHT(out, 0, tmp);
1256
    STORE_WHT(out, 1, tmp);
1257
    STORE_WHT(out, 2, tmp);
1258
    STORE_WHT(out, 3, tmp);
1259
  }
1260
}
1261

1262
#undef STORE_WHT
1263

1264
//------------------------------------------------------------------------------
1265

1266
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
1267
                              uint8_t* WEBP_RESTRICT dst) {
1268
  const int16x4_t A = vld1_dup_s16(in);
1269
  const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
1270
  const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
1271
  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
1272
  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
1273
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
1274
                      (uint64_t)( c1 & 0xffff) << 16 |
1275
                      (uint64_t)(-c1 & 0xffff) << 32 |
1276
                      (uint64_t)(-d1 & 0xffff) << 48;
1277
  const int16x4_t CD = vcreate_s16(cd);
1278
  const int16x4_t B = vqadd_s16(A, CD);
1279
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1280
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1281
  Add4x4_NEON(m0_m1, m2_m3, dst);
1282
}
1283

1284
//------------------------------------------------------------------------------
1285
// 4x4
1286

1287
static void DC4_NEON(uint8_t* dst) {    // DC
1288
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1289
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1290
  const uint16x4_t p1 = vpadd_u16(p0, p0);
1291
  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1292
  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1293
  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1294
  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1295
  const uint16x8_t s0 = vaddl_u8(L0, L1);
1296
  const uint16x8_t s1 = vaddl_u8(L2, L3);
1297
  const uint16x8_t s01 = vaddq_u16(s0, s1);
1298
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1299
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
1300
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1301
  int i;
1302
  for (i = 0; i < 4; ++i) {
1303
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1304
  }
1305
}
1306

1307
// TrueMotion (4x4 + 8x8)
1308
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1309
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1310
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
1311
  const uint16x8_t d = vsubl_u8(T, TL);  // A[c] - A[-1]
1312
  int y;
1313
  for (y = 0; y < size; y += 4) {
1314
    // left edge
1315
    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
1316
    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
1317
    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
1318
    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
1319
    // L[r] + A[c] - A[-1]
1320
    const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
1321
    const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
1322
    const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
1323
    const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
1324
    // Saturate and store the result.
1325
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1326
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1327
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1328
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1329
    if (size == 4) {
1330
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1331
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1332
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1333
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1334
    } else {
1335
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1336
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1337
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1338
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1339
    }
1340
    dst += 4 * BPS;
1341
  }
1342
}
1343

1344
static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
1345

1346
static void VE4_NEON(uint8_t* dst) {    // vertical
1347
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1348
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
1349
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
1350
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
1351
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1352
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1353
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1354
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1355
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1356
  int i;
1357
  for (i = 0; i < 4; ++i) {
1358
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1359
  }
1360
}
1361

1362
static void RD4_NEON(uint8_t* dst) {   // Down-right
1363
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1364
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1365
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1366
  const uint32_t I = dst[-1 + 0 * BPS];
1367
  const uint32_t J = dst[-1 + 1 * BPS];
1368
  const uint32_t K = dst[-1 + 2 * BPS];
1369
  const uint32_t L = dst[-1 + 3 * BPS];
1370
  const uint64x1_t LKJI____ =
1371
      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
1372
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1373
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1374
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1375
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1376
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1377
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1378
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1379
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1380
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1381
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1382
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1383
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1384
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1385
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1386
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1387
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1388
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1389
}
1390

1391
static void LD4_NEON(uint8_t* dst) {    // Down-left
1392
  // Note using the same shift trick as VE4() is slower here.
1393
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
1394
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
1395
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
1396
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
1397
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
1398
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
1399
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1400
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
1401
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1402
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1403
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1404
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1405
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1406
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1407
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1408
}
1409

1410
//------------------------------------------------------------------------------
1411
// Chroma
1412

1413
static void VE8uv_NEON(uint8_t* dst) {    // vertical
1414
  const uint8x8_t top = vld1_u8(dst - BPS);
1415
  int j;
1416
  for (j = 0; j < 8; ++j) {
1417
    vst1_u8(dst + j * BPS, top);
1418
  }
1419
}
1420

1421
static void HE8uv_NEON(uint8_t* dst) {    // horizontal
1422
  int j;
1423
  for (j = 0; j < 8; ++j) {
1424
    const uint8x8_t left = vld1_dup_u8(dst - 1);
1425
    vst1_u8(dst, left);
1426
    dst += BPS;
1427
  }
1428
}
1429

1430
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1431
  uint16x8_t sum_top;
1432
  uint16x8_t sum_left;
1433
  uint8x8_t dc0;
1434

1435
  if (do_top) {
1436
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1437
#if WEBP_AARCH64
1438
    const uint16_t p2 = vaddlv_u8(A);
1439
    sum_top = vdupq_n_u16(p2);
1440
#else
1441
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1442
    const uint16x4_t p1 = vpadd_u16(p0, p0);
1443
    const uint16x4_t p2 = vpadd_u16(p1, p1);
1444
    sum_top = vcombine_u16(p2, p2);
1445
#endif
1446
  }
1447

1448
  if (do_left) {
1449
    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1450
    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1451
    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1452
    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1453
    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
1454
    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
1455
    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
1456
    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
1457
    const uint16x8_t s0 = vaddl_u8(L0, L1);
1458
    const uint16x8_t s1 = vaddl_u8(L2, L3);
1459
    const uint16x8_t s2 = vaddl_u8(L4, L5);
1460
    const uint16x8_t s3 = vaddl_u8(L6, L7);
1461
    const uint16x8_t s01 = vaddq_u16(s0, s1);
1462
    const uint16x8_t s23 = vaddq_u16(s2, s3);
1463
    sum_left = vaddq_u16(s01, s23);
1464
  }
1465

1466
  if (do_top && do_left) {
1467
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1468
    dc0 = vrshrn_n_u16(sum, 4);
1469
  } else if (do_top) {
1470
    dc0 = vrshrn_n_u16(sum_top, 3);
1471
  } else if (do_left) {
1472
    dc0 = vrshrn_n_u16(sum_left, 3);
1473
  } else {
1474
    dc0 = vdup_n_u8(0x80);
1475
  }
1476

1477
  {
1478
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1479
    int i;
1480
    for (i = 0; i < 8; ++i) {
1481
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1482
    }
1483
  }
1484
}
1485

1486
static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
1487
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
1488
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
1489
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
1490

1491
static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
1492

1493
//------------------------------------------------------------------------------
1494
// 16x16
1495

1496
static void VE16_NEON(uint8_t* dst) {     // vertical
1497
  const uint8x16_t top = vld1q_u8(dst - BPS);
1498
  int j;
1499
  for (j = 0; j < 16; ++j) {
1500
    vst1q_u8(dst + j * BPS, top);
1501
  }
1502
}
1503

1504
static void HE16_NEON(uint8_t* dst) {     // horizontal
1505
  int j;
1506
  for (j = 0; j < 16; ++j) {
1507
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
1508
    vst1q_u8(dst, left);
1509
    dst += BPS;
1510
  }
1511
}
1512

1513
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1514
  uint16x8_t sum_top;
1515
  uint16x8_t sum_left;
1516
  uint8x8_t dc0;
1517

1518
  if (do_top) {
1519
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
1520
#if WEBP_AARCH64
1521
    const uint16_t p3 = vaddlvq_u8(A);
1522
    sum_top = vdupq_n_u16(p3);
1523
#else
1524
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
1525
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1526
    const uint16x4_t p2 = vpadd_u16(p1, p1);
1527
    const uint16x4_t p3 = vpadd_u16(p2, p2);
1528
    sum_top = vcombine_u16(p3, p3);
1529
#endif
1530
  }
1531

1532
  if (do_left) {
1533
    int i;
1534
    sum_left = vdupq_n_u16(0);
1535
    for (i = 0; i < 16; i += 8) {
1536
      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
1537
      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
1538
      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
1539
      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
1540
      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
1541
      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
1542
      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
1543
      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
1544
      const uint16x8_t s0 = vaddl_u8(L0, L1);
1545
      const uint16x8_t s1 = vaddl_u8(L2, L3);
1546
      const uint16x8_t s2 = vaddl_u8(L4, L5);
1547
      const uint16x8_t s3 = vaddl_u8(L6, L7);
1548
      const uint16x8_t s01 = vaddq_u16(s0, s1);
1549
      const uint16x8_t s23 = vaddq_u16(s2, s3);
1550
      const uint16x8_t sum = vaddq_u16(s01, s23);
1551
      sum_left = vaddq_u16(sum_left, sum);
1552
    }
1553
  }
1554

1555
  if (do_top && do_left) {
1556
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1557
    dc0 = vrshrn_n_u16(sum, 5);
1558
  } else if (do_top) {
1559
    dc0 = vrshrn_n_u16(sum_top, 4);
1560
  } else if (do_left) {
1561
    dc0 = vrshrn_n_u16(sum_left, 4);
1562
  } else {
1563
    dc0 = vdup_n_u8(0x80);
1564
  }
1565

1566
  {
1567
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
1568
    int i;
1569
    for (i = 0; i < 16; ++i) {
1570
      vst1q_u8(dst + i * BPS, dc);
1571
    }
1572
  }
1573
}
1574

1575
static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
1576
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
1577
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
1578
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
1579

1580
static void TM16_NEON(uint8_t* dst) {
1581
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1582
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
1583
  // A[c] - A[-1]
1584
  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
1585
  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
1586
  int y;
1587
  for (y = 0; y < 16; y += 4) {
1588
    // left edge
1589
    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
1590
    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
1591
    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
1592
    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
1593
    // L[r] + A[c] - A[-1]
1594
    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
1595
    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
1596
    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
1597
    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
1598
    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
1599
    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
1600
    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
1601
    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
1602
    // Saturate and store the result.
1603
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1604
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1605
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1606
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1607
    vst1q_u8(dst + 0 * BPS, row0);
1608
    vst1q_u8(dst + 1 * BPS, row1);
1609
    vst1q_u8(dst + 2 * BPS, row2);
1610
    vst1q_u8(dst + 3 * BPS, row3);
1611
    dst += 4 * BPS;
1612
  }
1613
}
1614

1615
//------------------------------------------------------------------------------
1616
// Entry point
1617

1618
extern void VP8DspInitNEON(void);
1619

1620
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1621
  VP8Transform = TransformTwo_NEON;
1622
  VP8TransformAC3 = TransformAC3_NEON;
1623
  VP8TransformDC = TransformDC_NEON;
1624
  VP8TransformWHT = TransformWHT_NEON;
1625

1626
  VP8VFilter16 = VFilter16_NEON;
1627
  VP8VFilter16i = VFilter16i_NEON;
1628
  VP8HFilter16 = HFilter16_NEON;
1629
#if !defined(WORK_AROUND_GCC)
1630
  VP8HFilter16i = HFilter16i_NEON;
1631
#endif
1632
  VP8VFilter8 = VFilter8_NEON;
1633
  VP8VFilter8i = VFilter8i_NEON;
1634
#if !defined(WORK_AROUND_GCC)
1635
  VP8HFilter8 = HFilter8_NEON;
1636
  VP8HFilter8i = HFilter8i_NEON;
1637
#endif
1638
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1639
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1640
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1641
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1642

1643
  VP8PredLuma4[0] = DC4_NEON;
1644
  VP8PredLuma4[1] = TM4_NEON;
1645
  VP8PredLuma4[2] = VE4_NEON;
1646
  VP8PredLuma4[4] = RD4_NEON;
1647
  VP8PredLuma4[6] = LD4_NEON;
1648

1649
  VP8PredLuma16[0] = DC16TopLeft_NEON;
1650
  VP8PredLuma16[1] = TM16_NEON;
1651
  VP8PredLuma16[2] = VE16_NEON;
1652
  VP8PredLuma16[3] = HE16_NEON;
1653
  VP8PredLuma16[4] = DC16NoTop_NEON;
1654
  VP8PredLuma16[5] = DC16NoLeft_NEON;
1655
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;
1656

1657
  VP8PredChroma8[0] = DC8uv_NEON;
1658
  VP8PredChroma8[1] = TM8uv_NEON;
1659
  VP8PredChroma8[2] = VE8uv_NEON;
1660
  VP8PredChroma8[3] = HE8uv_NEON;
1661
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
1662
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
1663
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
1664
}
1665

1666
#else  // !WEBP_USE_NEON
1667

1668
WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1669

1670
#endif  // WEBP_USE_NEON
1671

1672
Product

Resources

Company