// File: 3rdparty/libwebp/src/dsp/dec_neon.c
// Copyright 2012 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// ARM NEON version of dsp functions and loop filtering.10//11// Authors: Somnath Banerjee ([email protected])12// Johann Koenig ([email protected])1314#include "src/dsp/dsp.h"1516#if defined(WEBP_USE_NEON)1718#include "src/dsp/neon.h"19#include "src/dec/vp8i_dec.h"2021//------------------------------------------------------------------------------22// NxM Loading functions2324#if !defined(WORK_AROUND_GCC)2526// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation27// (register alloc, probably). The variants somewhat mitigate the problem, but28// not quite. 
HFilter16i() remains problematic.29static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,30int stride) {31const uint8x8_t zero = vdup_n_u8(0);32uint8x8x4_t out;33INIT_VECTOR4(out, zero, zero, zero, zero);34out = vld4_lane_u8(src + 0 * stride, out, 0);35out = vld4_lane_u8(src + 1 * stride, out, 1);36out = vld4_lane_u8(src + 2 * stride, out, 2);37out = vld4_lane_u8(src + 3 * stride, out, 3);38out = vld4_lane_u8(src + 4 * stride, out, 4);39out = vld4_lane_u8(src + 5 * stride, out, 5);40out = vld4_lane_u8(src + 6 * stride, out, 6);41out = vld4_lane_u8(src + 7 * stride, out, 7);42return out;43}4445static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,46uint8x16_t* const p1,47uint8x16_t* const p0,48uint8x16_t* const q0,49uint8x16_t* const q1) {50// row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]51// row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]52const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);53const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);54*p1 = vcombine_u8(row0.val[0], row8.val[0]);55*p0 = vcombine_u8(row0.val[1], row8.val[1]);56*q0 = vcombine_u8(row0.val[2], row8.val[2]);57*q1 = vcombine_u8(row0.val[3], row8.val[3]);58}5960#else // WORK_AROUND_GCC6162#define LOADQ_LANE_32b(VALUE, LANE) do { \63(VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \64src += stride; \65} while (0)6667static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,68uint8x16_t* const p1,69uint8x16_t* const p0,70uint8x16_t* const q0,71uint8x16_t* const q1) {72const uint32x4_t zero = vdupq_n_u32(0);73uint32x4x4_t in;74INIT_VECTOR4(in, zero, zero, zero, zero);75src -= 2;76LOADQ_LANE_32b(in.val[0], 0);77LOADQ_LANE_32b(in.val[1], 0);78LOADQ_LANE_32b(in.val[2], 0);79LOADQ_LANE_32b(in.val[3], 0);80LOADQ_LANE_32b(in.val[0], 1);81LOADQ_LANE_32b(in.val[1], 1);82LOADQ_LANE_32b(in.val[2], 1);83LOADQ_LANE_32b(in.val[3], 1);84LOADQ_LANE_32b(in.val[0], 2);85LOADQ_LANE_32b(in.val[1], 2);86LOADQ_LANE_32b(in.val[2], 
2);87LOADQ_LANE_32b(in.val[3], 2);88LOADQ_LANE_32b(in.val[0], 3);89LOADQ_LANE_32b(in.val[1], 3);90LOADQ_LANE_32b(in.val[2], 3);91LOADQ_LANE_32b(in.val[3], 3);92// Transpose four 4x4 parts:93{94const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),95vreinterpretq_u8_u32(in.val[1]));96const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),97vreinterpretq_u8_u32(in.val[3]));98const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),99vreinterpretq_u16_u8(row23.val[0]));100const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),101vreinterpretq_u16_u8(row23.val[1]));102*p1 = vreinterpretq_u8_u16(row02.val[0]);103*p0 = vreinterpretq_u8_u16(row13.val[0]);104*q0 = vreinterpretq_u8_u16(row02.val[1]);105*q1 = vreinterpretq_u8_u16(row13.val[1]);106}107}108#undef LOADQ_LANE_32b109110#endif // !WORK_AROUND_GCC111112static WEBP_INLINE void Load8x16_NEON(113const uint8_t* const src, int stride,114uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,115uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,116uint8x16_t* const q2, uint8x16_t* const q3) {117Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);118Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);119}120121static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,122uint8x16_t* const p1,123uint8x16_t* const p0,124uint8x16_t* const q0,125uint8x16_t* const q1) {126*p1 = vld1q_u8(src - 2 * stride);127*p0 = vld1q_u8(src - 1 * stride);128*q0 = vld1q_u8(src + 0 * stride);129*q1 = vld1q_u8(src + 1 * stride);130}131132static WEBP_INLINE void Load16x8_NEON(133const uint8_t* const src, int stride,134uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,135uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,136uint8x16_t* const q2, uint8x16_t* const q3) {137Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);138Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);139}140141static WEBP_INLINE void 
Load8x8x2_NEON(142const uint8_t* const u, const uint8_t* const v, int stride,143uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,144uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,145uint8x16_t* const q2, uint8x16_t* const q3) {146// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination147// and the v-samples on the higher half.148*p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));149*p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));150*p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));151*p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));152*q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));153*q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));154*q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));155*q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));156}157158#if !defined(WORK_AROUND_GCC)159160#define LOAD_UV_8(ROW) \161vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))162163static WEBP_INLINE void Load8x8x2T_NEON(164const uint8_t* const u, const uint8_t* const v, int stride,165uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,166uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,167uint8x16_t* const q2, uint8x16_t* const q3) {168// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination169// and the v-samples on the higher half.170const uint8x16_t row0 = LOAD_UV_8(0);171const uint8x16_t row1 = LOAD_UV_8(1);172const uint8x16_t row2 = LOAD_UV_8(2);173const uint8x16_t row3 = LOAD_UV_8(3);174const uint8x16_t row4 = LOAD_UV_8(4);175const uint8x16_t row5 = LOAD_UV_8(5);176const uint8x16_t row6 = LOAD_UV_8(6);177const uint8x16_t row7 = LOAD_UV_8(7);178// Perform two side-by-side 8x8 transposes179// u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07180// u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 
v12 ...181// u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...182// u30 u31 u32 u33 u34 u35 u36 u37 | ...183// u40 u41 u42 u43 u44 u45 u46 u47 | ...184// u50 u51 u52 u53 u54 u55 u56 u57 | ...185// u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...186// u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...187const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...188// u01 u11 u03 u13 ...189const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...190// u21 u31 u23 u33 ...191const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...192const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...193const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),194vreinterpretq_u16_u8(row23.val[0]));195const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),196vreinterpretq_u16_u8(row23.val[1]));197const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),198vreinterpretq_u16_u8(row67.val[0]));199const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),200vreinterpretq_u16_u8(row67.val[1]));201const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),202vreinterpretq_u32_u16(row46.val[0]));203const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),204vreinterpretq_u32_u16(row46.val[1]));205const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),206vreinterpretq_u32_u16(row57.val[0]));207const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),208vreinterpretq_u32_u16(row57.val[1]));209*p3 = vreinterpretq_u8_u32(row04.val[0]);210*p2 = vreinterpretq_u8_u32(row15.val[0]);211*p1 = vreinterpretq_u8_u32(row26.val[0]);212*p0 = vreinterpretq_u8_u32(row37.val[0]);213*q0 = vreinterpretq_u8_u32(row04.val[1]);214*q1 = vreinterpretq_u8_u32(row15.val[1]);215*q2 = vreinterpretq_u8_u32(row26.val[1]);216*q3 = vreinterpretq_u8_u32(row37.val[1]);217}218#undef LOAD_UV_8219220#endif // !WORK_AROUND_GCC221222static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t 
v,223uint8_t* const dst, int stride) {224vst2_lane_u8(dst + 0 * stride, v, 0);225vst2_lane_u8(dst + 1 * stride, v, 1);226vst2_lane_u8(dst + 2 * stride, v, 2);227vst2_lane_u8(dst + 3 * stride, v, 3);228vst2_lane_u8(dst + 4 * stride, v, 4);229vst2_lane_u8(dst + 5 * stride, v, 5);230vst2_lane_u8(dst + 6 * stride, v, 6);231vst2_lane_u8(dst + 7 * stride, v, 7);232}233234static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,235uint8_t* const dst, int stride) {236uint8x8x2_t lo, hi;237lo.val[0] = vget_low_u8(p0);238lo.val[1] = vget_low_u8(q0);239hi.val[0] = vget_high_u8(p0);240hi.val[1] = vget_high_u8(q0);241Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);242Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);243}244245#if !defined(WORK_AROUND_GCC)246static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,247uint8_t* const dst, int stride) {248vst4_lane_u8(dst + 0 * stride, v, 0);249vst4_lane_u8(dst + 1 * stride, v, 1);250vst4_lane_u8(dst + 2 * stride, v, 2);251vst4_lane_u8(dst + 3 * stride, v, 3);252vst4_lane_u8(dst + 4 * stride, v, 4);253vst4_lane_u8(dst + 5 * stride, v, 5);254vst4_lane_u8(dst + 6 * stride, v, 6);255vst4_lane_u8(dst + 7 * stride, v, 7);256}257258static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,259const uint8x16_t q0, const uint8x16_t q1,260uint8_t* const dst, int stride) {261uint8x8x4_t lo, hi;262INIT_VECTOR4(lo,263vget_low_u8(p1), vget_low_u8(p0),264vget_low_u8(q0), vget_low_u8(q1));265INIT_VECTOR4(hi,266vget_high_u8(p1), vget_high_u8(p0),267vget_high_u8(q0), vget_high_u8(q1));268Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);269Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);270}271#endif // !WORK_AROUND_GCC272273static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,274uint8_t* const dst, int stride) {275vst1q_u8(dst - stride, p0);276vst1q_u8(dst, q0);277}278279static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,280const uint8x16_t q0, const 
uint8x16_t q1,281uint8_t* const dst, int stride) {282Store16x2_NEON(p1, p0, dst - stride, stride);283Store16x2_NEON(q0, q1, dst + stride, stride);284}285286static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,287const uint8x16_t q0,288uint8_t* const u, uint8_t* const v,289int stride) {290// p0 and q0 contain the u+v samples packed in low/high halves.291vst1_u8(u - stride, vget_low_u8(p0));292vst1_u8(u, vget_low_u8(q0));293vst1_u8(v - stride, vget_high_u8(p0));294vst1_u8(v, vget_high_u8(q0));295}296297static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,298const uint8x16_t p0,299const uint8x16_t q0,300const uint8x16_t q1,301uint8_t* const u, uint8_t* const v,302int stride) {303// The p1...q1 registers contain the u+v samples packed in low/high halves.304Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);305Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);306}307308#if !defined(WORK_AROUND_GCC)309310#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \311vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \312vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \313(DST) += stride; \314} while (0)315316static WEBP_INLINE void Store6x8x2_NEON(317const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,318const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,319uint8_t* u, uint8_t* v, int stride) {320uint8x8x3_t u0, u1, v0, v1;321INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));322INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));323INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));324INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));325STORE6_LANE(u, u0, u1, 0);326STORE6_LANE(u, u0, u1, 1);327STORE6_LANE(u, u0, u1, 2);328STORE6_LANE(u, u0, u1, 3);329STORE6_LANE(u, u0, u1, 4);330STORE6_LANE(u, u0, u1, 5);331STORE6_LANE(u, u0, u1, 6);332STORE6_LANE(u, u0, u1, 7);333STORE6_LANE(v, v0, v1, 0);334STORE6_LANE(v, v0, v1, 1);335STORE6_LANE(v, v0, v1, 2);336STORE6_LANE(v, v0, v1, 
3);337STORE6_LANE(v, v0, v1, 4);338STORE6_LANE(v, v0, v1, 5);339STORE6_LANE(v, v0, v1, 6);340STORE6_LANE(v, v0, v1, 7);341}342#undef STORE6_LANE343344static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,345const uint8x16_t p0,346const uint8x16_t q0,347const uint8x16_t q1,348uint8_t* const u, uint8_t* const v,349int stride) {350uint8x8x4_t u0, v0;351INIT_VECTOR4(u0,352vget_low_u8(p1), vget_low_u8(p0),353vget_low_u8(q0), vget_low_u8(q1));354INIT_VECTOR4(v0,355vget_high_u8(p1), vget_high_u8(p0),356vget_high_u8(q0), vget_high_u8(q1));357vst4_lane_u8(u - 2 + 0 * stride, u0, 0);358vst4_lane_u8(u - 2 + 1 * stride, u0, 1);359vst4_lane_u8(u - 2 + 2 * stride, u0, 2);360vst4_lane_u8(u - 2 + 3 * stride, u0, 3);361vst4_lane_u8(u - 2 + 4 * stride, u0, 4);362vst4_lane_u8(u - 2 + 5 * stride, u0, 5);363vst4_lane_u8(u - 2 + 6 * stride, u0, 6);364vst4_lane_u8(u - 2 + 7 * stride, u0, 7);365vst4_lane_u8(v - 2 + 0 * stride, v0, 0);366vst4_lane_u8(v - 2 + 1 * stride, v0, 1);367vst4_lane_u8(v - 2 + 2 * stride, v0, 2);368vst4_lane_u8(v - 2 + 3 * stride, v0, 3);369vst4_lane_u8(v - 2 + 4 * stride, v0, 4);370vst4_lane_u8(v - 2 + 5 * stride, v0, 5);371vst4_lane_u8(v - 2 + 6 * stride, v0, 6);372vst4_lane_u8(v - 2 + 7 * stride, v0, 7);373}374375#endif // !WORK_AROUND_GCC376377// Zero extend 'v' to an int16x8_t.378static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {379return vreinterpretq_s16_u16(vmovl_u8(v));380}381382// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result383// to the corresponding rows of 'dst'.384static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,385const int16x8_t dst01,386const int16x8_t dst23) {387// Unsigned saturate to 8b.388const uint8x8_t dst01_u8 = vqmovun_s16(dst01);389const uint8x8_t dst23_u8 = vqmovun_s16(dst23);390391// Store the results.392vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);393vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 
1);394vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);395vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);396}397398static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,399const int16x8_t row23,400uint8_t* const dst) {401uint32x2_t dst01 = vdup_n_u32(0);402uint32x2_t dst23 = vdup_n_u32(0);403404// Load the source pixels.405dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);406dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);407dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);408dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);409410{411// Convert to 16b.412const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));413const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));414415// Descale with rounding.416const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);417const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);418// Add the inverse transform.419SaturateAndStore4x4_NEON(dst, out01, out23);420}421}422423//-----------------------------------------------------------------------------424// Simple In-loop filtering (Paragraph 15.2)425426static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,427const uint8x16_t q0, const uint8x16_t q1,428int thresh) {429const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);430const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)431const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)432const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)433const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2434const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);435const uint8x16_t mask = vcgeq_u8(thresh_v, sum);436return mask;437}438439static int8x16_t FlipSign_NEON(const uint8x16_t v) {440const uint8x16_t sign_bit = vdupq_n_u8(0x80);441return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));442}443444static uint8x16_t 
FlipSignBack_NEON(const int8x16_t v) {445const int8x16_t sign_bit = vdupq_n_s8(0x80);446return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));447}448449static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,450const int8x16_t q0, const int8x16_t q1) {451const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)452const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)453const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)454const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)455const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)456return s3;457}458459static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {460const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)461const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)462const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)463return s2;464}465466//------------------------------------------------------------------------------467468static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,469const int8x16_t delta,470int8x16_t* const op0,471int8x16_t* const oq0) {472const int8x16_t kCst3 = vdupq_n_s8(0x03);473const int8x16_t kCst4 = vdupq_n_s8(0x04);474const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);475const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);476const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);477const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);478*op0 = vqaddq_s8(p0s, delta3);479*oq0 = vqsubq_s8(q0s, delta4);480}481482#if defined(WEBP_USE_INTRINSICS)483484static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,485const int8x16_t delta,486uint8x16_t* const op0, uint8x16_t* const oq0) {487const int8x16_t kCst3 = vdupq_n_s8(0x03);488const int8x16_t kCst4 = vdupq_n_s8(0x04);489const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);490const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);491const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);492const int8x16_t delta4 = vshrq_n_s8(delta_p4, 
3);493const int8x16_t sp0 = vqaddq_s8(p0s, delta3);494const int8x16_t sq0 = vqsubq_s8(q0s, delta4);495*op0 = FlipSignBack_NEON(sp0);496*oq0 = FlipSignBack_NEON(sq0);497}498499static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,500const uint8x16_t q0, const uint8x16_t q1,501const uint8x16_t mask,502uint8x16_t* const op0, uint8x16_t* const oq0) {503const int8x16_t p1s = FlipSign_NEON(p1);504const int8x16_t p0s = FlipSign_NEON(p0);505const int8x16_t q0s = FlipSign_NEON(q0);506const int8x16_t q1s = FlipSign_NEON(q1);507const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);508const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));509ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);510}511512static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {513uint8x16_t p1, p0, q0, q1, op0, oq0;514Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);515{516const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);517DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);518}519Store16x2_NEON(op0, oq0, p, stride);520}521522static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {523uint8x16_t p1, p0, q0, q1, oq0, op0;524Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);525{526const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);527DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);528}529Store2x16_NEON(op0, oq0, p, stride);530}531532#else533534// Load/Store vertical edge535#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \536"vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \537"vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \538"vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \539"vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \540"vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \541"vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \542"vld4.8 {" #c1 "[6]," 
#c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \543"vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"544545#define STORE8x2(c1, c2, p, stride) \546"vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \547"vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \548"vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \549"vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \550"vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \551"vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \552"vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \553"vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"554555#define QRegs "q0", "q1", "q2", "q3", \556"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"557558#define FLIP_SIGN_BIT2(a, b, s) \559"veor " #a "," #a "," #s " \n" \560"veor " #b "," #b "," #s " \n" \561562#define FLIP_SIGN_BIT4(a, b, c, d, s) \563FLIP_SIGN_BIT2(a, b, s) \564FLIP_SIGN_BIT2(c, d, s) \565566#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \567"vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \568"vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \569"vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \570"vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \571"vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \572"vdup.8 q14, " #thresh " \n" \573"vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */574575#define GET_BASE_DELTA(p1, p0, q0, q1, o) \576"vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \577"vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \578"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \579"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \580"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */581582#define DO_SIMPLE_FILTER(p0, q0, fl) \583"vmov.i8 q15, #0x03 \n" \584"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \585"vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \586"vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 
+= filter1 */ \587\588"vmov.i8 q15, #0x04 \n" \589"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \590"vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \591"vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */592593// Applies filter on 2 pixels (p0 and q0)594#define DO_FILTER2(p1, p0, q0, q1, thresh) \595NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \596"vmov.i8 q10, #0x80 \n" /* sign bit */ \597FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \598GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \599"vand q9, q9, q11 \n" /* apply filter mask */ \600DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \601FLIP_SIGN_BIT2(p0, q0, q10)602603static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {604__asm__ volatile (605"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride606607"vld1.u8 {q1}, [%[p]], %[stride] \n" // p1608"vld1.u8 {q2}, [%[p]], %[stride] \n" // p0609"vld1.u8 {q3}, [%[p]], %[stride] \n" // q0610"vld1.u8 {q12}, [%[p]] \n" // q1611612DO_FILTER2(q1, q2, q3, q12, %[thresh])613614"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride615616"vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0617"vst1.u8 {q3}, [%[p]] \n" // store oq0618: [p] "+r"(p)619: [stride] "r"(stride), [thresh] "r"(thresh)620: "memory", QRegs621);622}623624static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {625__asm__ volatile (626"sub r4, %[p], #2 \n" // base1 = p - 2627"lsl r6, %[stride], #1 \n" // r6 = 2 * stride628"add r5, r4, %[stride] \n" // base2 = base1 + stride629630LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)631LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)632"vswp d3, d24 \n" // p1:q1 p0:q3633"vswp d5, d26 \n" // q0:q2 q1:q4634"vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4635636DO_FILTER2(q1, q2, q12, q13, %[thresh])637638"sub %[p], %[p], #1 \n" // p - 1639640"vswp d5, d24 \n"641STORE8x2(d4, d5, [%[p]], %[stride])642STORE8x2(d24, d25, [%[p]], %[stride])643644: [p] "+r"(p)645: [stride] "r"(stride), [thresh] 
"r"(thresh)646: "memory", "r4", "r5", "r6", QRegs647);648}649650#undef LOAD8x4651#undef STORE8x2652653#endif // WEBP_USE_INTRINSICS654655static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {656uint32_t k;657for (k = 3; k != 0; --k) {658p += 4 * stride;659SimpleVFilter16_NEON(p, stride, thresh);660}661}662663static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {664uint32_t k;665for (k = 3; k != 0; --k) {666p += 4;667SimpleHFilter16_NEON(p, stride, thresh);668}669}670671//------------------------------------------------------------------------------672// Complex In-loop filtering (Paragraph 15.3)673674static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,675const uint8x16_t q0, const uint8x16_t q1,676int hev_thresh) {677const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);678const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)679const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)680const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);681const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);682return mask;683}684685static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,686const uint8x16_t p1, const uint8x16_t p0,687const uint8x16_t q0, const uint8x16_t q1,688const uint8x16_t q2, const uint8x16_t q3,689int ithresh, int thresh) {690const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);691const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)692const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)693const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)694const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)695const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)696const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)697const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);698const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);699const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);700const uint8x16_t max12 = vmaxq_u8(max1, 
max2);701const uint8x16_t max123 = vmaxq_u8(max12, max3);702const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);703const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);704const uint8x16_t mask = vandq_u8(mask1, mask2);705return mask;706}707708// 4-points filter709710static void ApplyFilter4_NEON(711const int8x16_t p1, const int8x16_t p0,712const int8x16_t q0, const int8x16_t q1,713const int8x16_t delta0,714uint8x16_t* const op1, uint8x16_t* const op0,715uint8x16_t* const oq0, uint8x16_t* const oq1) {716const int8x16_t kCst3 = vdupq_n_s8(0x03);717const int8x16_t kCst4 = vdupq_n_s8(0x04);718const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);719const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);720const int8x16_t a1 = vshrq_n_s8(delta1, 3);721const int8x16_t a2 = vshrq_n_s8(delta2, 3);722const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1723*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)724*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)725*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)726*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)727}728729static void DoFilter4_NEON(730const uint8x16_t p1, const uint8x16_t p0,731const uint8x16_t q0, const uint8x16_t q1,732const uint8x16_t mask, const uint8x16_t hev_mask,733uint8x16_t* const op1, uint8x16_t* const op0,734uint8x16_t* const oq0, uint8x16_t* const oq1) {735// This is a fused version of DoFilter2() calling ApplyFilter2 directly736const int8x16_t p1s = FlipSign_NEON(p1);737int8x16_t p0s = FlipSign_NEON(p0);738int8x16_t q0s = FlipSign_NEON(q0);739const int8x16_t q1s = FlipSign_NEON(q1);740const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);741742// do_filter2 part (simple loopfilter on pixels with hev)743{744const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);745const int8x16_t simple_lf_delta =746vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));747ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);748}749750// 
  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

// 6-points filter

// Applies the complex (6-tap) loopfilter delta to the three pixels on each
// side of the edge and un-biases the result back to unsigned.
static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

// Complex loopfilter: simple (2-tap) filter where hev is set, 6-tap filter
// on the remaining pixels selected by 'mask'.
static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign_NEON(p2);
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t q2s = FlipSign_NEON(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                      op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16_NEON(op2, op1, p - 2, stride);
    Store2x16_NEON(op0, oq0, p + 0, stride);
    Store2x16_NEON(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // as in VFilter16i(), p3/p2 outputs are carried over to the next span
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This can not be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

// One 1-D pass of the 4x4 inverse DCT, operating on two rows at a time.
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
}

#else

static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[constants]]        \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp            d3, d4                      \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16        q8, q8, #1                  \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* See long winded explanations prior */
    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"

    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val) + 4 >> 3 */
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    /* Must accumulate before saturating */
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}

#endif    // WEBP_USE_INTRINSICS

static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (do_two) {
    TransformOne_NEON(in + 16, dst + 4);
  }
}

static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
  const int16x8_t DC = vdupq_n_s16(in[0]);
  Add4x4_NEON(DC, DC, dst);
}

//------------------------------------------------------------------------------

// Scatters one column of 'rows' to 'dst', one value every 16 coefficients
// (i.e. into the DC slot of each of the 16 sub-blocks).
#define STORE_WHT(dst, col, rows) do {                  \
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)

static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
  int32x4x4_t tmp;

  {
    // Load the source.
    const int16x4_t in00_03 = vld1_s16(in + 0);
    const int16x4_t in04_07 = vld1_s16(in + 4);
    const int16x4_t in08_11 = vld1_s16(in + 8);
    const int16x4_t in12_15 = vld1_s16(in + 12);
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);
    // Arrange the temporary results column-wise.
    tmp = Transpose4x4_NEON(tmp);
  }

  {
    const int32x4_t kCst3 = vdupq_n_s32(3);
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);

    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);

    // right shift the results by 3.
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);

    STORE_WHT(out, 0, tmp);
    STORE_WHT(out, 1, tmp);
    STORE_WHT(out, 2, tmp);
    STORE_WHT(out, 3, tmp);
  }
}

#undef STORE_WHT

//------------------------------------------------------------------------------

#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4_NEON(m0_m1, m2_m3, dst);
}
#undef MUL

//------------------------------------------------------------------------------
// 4x4

static void DC4_NEON(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
  const uint16x8_t s0 = vaddq_u16(L0, L1);
  const uint16x8_t s1 = vaddq_u16(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  // lane 0 now holds: sum of 4 left pixels + sum of 4 top pixels
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}

// TrueMotion (4x4 + 8x8)
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }

static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}

static void RD4_NEON(uint8_t* dst) {   // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

static void LD4_NEON(uint8_t* dst) {   // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  int j;
  for (j = 0; j < 8; ++j) {
    vst1_u8(dst + j * BPS, top);
  }
}

static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const uint8x8_t left = vld1_dup_u8(dst - 1);
    vst1_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
    const uint16x8_t s0 = vaddq_u16(L0, L1);
    const uint16x8_t s1 = vaddq_u16(L2, L3);
    const uint16x8_t s2 = vaddq_u16(L4, L5);
    const uint16x8_t s3 = vaddq_u16(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }

static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

static void VE16_NEON(uint8_t* dst) {     // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16_NEON(uint8_t* dst) {     // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
      const uint16x8_t s0 = vaddq_u16(L0, L1);
      const uint16x8_t s1 = vaddq_u16(L2, L3);
      const uint16x8_t s2 = vaddq_u16(L4, L5);
      const uint16x8_t s3 = vaddq_u16(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }

static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i_NEON;
#endif
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;

  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON