GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/dsp/dec_neon.c
1
// Copyright 2012 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// ARM NEON version of dsp functions and loop filtering.
11
//
12
// Authors: Somnath Banerjee ([email protected])
13
// Johann Koenig ([email protected])
14
15
#include "src/dsp/dsp.h"
16
17
#if defined(WEBP_USE_NEON)
18
19
#include "src/dsp/neon.h"
20
#include "src/dec/vp8i_dec.h"
21
22
//------------------------------------------------------------------------------
23
// NxM Loading functions
24
25
#if !defined(WORK_AROUND_GCC)
26
27
// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28
// (register alloc, probably). The variants somewhat mitigate the problem, but
29
// not quite. HFilter16i() remains problematic.
30
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31
int stride) {
32
const uint8x8_t zero = vdup_n_u8(0);
33
uint8x8x4_t out;
34
INIT_VECTOR4(out, zero, zero, zero, zero);
35
out = vld4_lane_u8(src + 0 * stride, out, 0);
36
out = vld4_lane_u8(src + 1 * stride, out, 1);
37
out = vld4_lane_u8(src + 2 * stride, out, 2);
38
out = vld4_lane_u8(src + 3 * stride, out, 3);
39
out = vld4_lane_u8(src + 4 * stride, out, 4);
40
out = vld4_lane_u8(src + 5 * stride, out, 5);
41
out = vld4_lane_u8(src + 6 * stride, out, 6);
42
out = vld4_lane_u8(src + 7 * stride, out, 7);
43
return out;
44
}
45
46
static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47
uint8x16_t* const p1,
48
uint8x16_t* const p0,
49
uint8x16_t* const q0,
50
uint8x16_t* const q1) {
51
// row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52
// row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53
const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54
const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55
*p1 = vcombine_u8(row0.val[0], row8.val[0]);
56
*p0 = vcombine_u8(row0.val[1], row8.val[1]);
57
*q0 = vcombine_u8(row0.val[2], row8.val[2]);
58
*q1 = vcombine_u8(row0.val[3], row8.val[3]);
59
}
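// Illustrative scalar sketch (not part of libwebp): Load4x16_NEON gathers a
// transposed 4x16 patch -- the two columns left and the two columns right of
// a vertical edge, one byte per row -- as the plain loop below would.
static WEBP_INLINE void Load4x16_Scalar(const uint8_t* const src, int stride,
                                        uint8_t p1[16], uint8_t p0[16],
                                        uint8_t q0[16], uint8_t q1[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    p1[i] = src[i * stride - 2];
    p0[i] = src[i * stride - 1];
    q0[i] = src[i * stride + 0];
    q1[i] = src[i * stride + 1];
  }
}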
60
61
#else // WORK_AROUND_GCC
62
63
#define LOADQ_LANE_32b(VALUE, LANE) do { \
64
(VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
65
src += stride; \
66
} while (0)
67
68
static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69
uint8x16_t* const p1,
70
uint8x16_t* const p0,
71
uint8x16_t* const q0,
72
uint8x16_t* const q1) {
73
const uint32x4_t zero = vdupq_n_u32(0);
74
uint32x4x4_t in;
75
INIT_VECTOR4(in, zero, zero, zero, zero);
76
src -= 2;
77
LOADQ_LANE_32b(in.val[0], 0);
78
LOADQ_LANE_32b(in.val[1], 0);
79
LOADQ_LANE_32b(in.val[2], 0);
80
LOADQ_LANE_32b(in.val[3], 0);
81
LOADQ_LANE_32b(in.val[0], 1);
82
LOADQ_LANE_32b(in.val[1], 1);
83
LOADQ_LANE_32b(in.val[2], 1);
84
LOADQ_LANE_32b(in.val[3], 1);
85
LOADQ_LANE_32b(in.val[0], 2);
86
LOADQ_LANE_32b(in.val[1], 2);
87
LOADQ_LANE_32b(in.val[2], 2);
88
LOADQ_LANE_32b(in.val[3], 2);
89
LOADQ_LANE_32b(in.val[0], 3);
90
LOADQ_LANE_32b(in.val[1], 3);
91
LOADQ_LANE_32b(in.val[2], 3);
92
LOADQ_LANE_32b(in.val[3], 3);
93
// Transpose four 4x4 parts:
94
{
95
const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96
vreinterpretq_u8_u32(in.val[1]));
97
const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98
vreinterpretq_u8_u32(in.val[3]));
99
const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100
vreinterpretq_u16_u8(row23.val[0]));
101
const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102
vreinterpretq_u16_u8(row23.val[1]));
103
*p1 = vreinterpretq_u8_u16(row02.val[0]);
104
*p0 = vreinterpretq_u8_u16(row13.val[0]);
105
*q0 = vreinterpretq_u8_u16(row02.val[1]);
106
*q1 = vreinterpretq_u8_u16(row13.val[1]);
107
}
108
}
109
#undef LOADQ_LANE_32b
110
111
#endif // !WORK_AROUND_GCC
112
113
static WEBP_INLINE void Load8x16_NEON(
114
const uint8_t* const src, int stride,
115
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117
uint8x16_t* const q2, uint8x16_t* const q3) {
118
Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119
Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120
}
121
122
static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123
uint8x16_t* const p1,
124
uint8x16_t* const p0,
125
uint8x16_t* const q0,
126
uint8x16_t* const q1) {
127
*p1 = vld1q_u8(src - 2 * stride);
128
*p0 = vld1q_u8(src - 1 * stride);
129
*q0 = vld1q_u8(src + 0 * stride);
130
*q1 = vld1q_u8(src + 1 * stride);
131
}
132
133
static WEBP_INLINE void Load16x8_NEON(
134
const uint8_t* const src, int stride,
135
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137
uint8x16_t* const q2, uint8x16_t* const q3) {
138
Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
139
Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
140
}
141
142
static WEBP_INLINE void Load8x8x2_NEON(
143
const uint8_t* const u, const uint8_t* const v, int stride,
144
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146
uint8x16_t* const q2, uint8x16_t* const q3) {
147
// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148
// and the v-samples in the upper half.
149
*p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150
*p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151
*p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152
*p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153
*q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154
*q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155
*q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156
*q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157
}
158
159
#if !defined(WORK_AROUND_GCC)
160
161
#define LOAD_UV_8(ROW) \
162
vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163
164
static WEBP_INLINE void Load8x8x2T_NEON(
165
const uint8_t* const u, const uint8_t* const v, int stride,
166
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168
uint8x16_t* const q2, uint8x16_t* const q3) {
169
// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170
// and the v-samples in the upper half.
171
const uint8x16_t row0 = LOAD_UV_8(0);
172
const uint8x16_t row1 = LOAD_UV_8(1);
173
const uint8x16_t row2 = LOAD_UV_8(2);
174
const uint8x16_t row3 = LOAD_UV_8(3);
175
const uint8x16_t row4 = LOAD_UV_8(4);
176
const uint8x16_t row5 = LOAD_UV_8(5);
177
const uint8x16_t row6 = LOAD_UV_8(6);
178
const uint8x16_t row7 = LOAD_UV_8(7);
179
// Perform two side-by-side 8x8 transposes
180
// u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
181
// u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
182
// u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
183
// u30 u31 u32 u33 u34 u35 u36 u37 | ...
184
// u40 u41 u42 u43 u44 u45 u46 u47 | ...
185
// u50 u51 u52 u53 u54 u55 u56 u57 | ...
186
// u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
187
// u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
188
const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
189
// u01 u11 u03 u13 ...
190
const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
191
// u21 u31 u23 u33 ...
192
const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
193
const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
194
const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
195
vreinterpretq_u16_u8(row23.val[0]));
196
const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
197
vreinterpretq_u16_u8(row23.val[1]));
198
const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
199
vreinterpretq_u16_u8(row67.val[0]));
200
const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
201
vreinterpretq_u16_u8(row67.val[1]));
202
const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
203
vreinterpretq_u32_u16(row46.val[0]));
204
const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
205
vreinterpretq_u32_u16(row46.val[1]));
206
const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
207
vreinterpretq_u32_u16(row57.val[0]));
208
const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
209
vreinterpretq_u32_u16(row57.val[1]));
210
*p3 = vreinterpretq_u8_u32(row04.val[0]);
211
*p2 = vreinterpretq_u8_u32(row15.val[0]);
212
*p1 = vreinterpretq_u8_u32(row26.val[0]);
213
*p0 = vreinterpretq_u8_u32(row37.val[0]);
214
*q0 = vreinterpretq_u8_u32(row04.val[1]);
215
*q1 = vreinterpretq_u8_u32(row15.val[1]);
216
*q2 = vreinterpretq_u8_u32(row26.val[1]);
217
*q3 = vreinterpretq_u8_u32(row37.val[1]);
218
}
219
#undef LOAD_UV_8
220
221
#endif // !WORK_AROUND_GCC
222
223
static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224
uint8_t* const dst, int stride) {
225
vst2_lane_u8(dst + 0 * stride, v, 0);
226
vst2_lane_u8(dst + 1 * stride, v, 1);
227
vst2_lane_u8(dst + 2 * stride, v, 2);
228
vst2_lane_u8(dst + 3 * stride, v, 3);
229
vst2_lane_u8(dst + 4 * stride, v, 4);
230
vst2_lane_u8(dst + 5 * stride, v, 5);
231
vst2_lane_u8(dst + 6 * stride, v, 6);
232
vst2_lane_u8(dst + 7 * stride, v, 7);
233
}
234
235
static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236
uint8_t* const dst, int stride) {
237
uint8x8x2_t lo, hi;
238
lo.val[0] = vget_low_u8(p0);
239
lo.val[1] = vget_low_u8(q0);
240
hi.val[0] = vget_high_u8(p0);
241
hi.val[1] = vget_high_u8(q0);
242
Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243
Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244
}
245
246
#if !defined(WORK_AROUND_GCC)
247
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248
uint8_t* const dst, int stride) {
249
vst4_lane_u8(dst + 0 * stride, v, 0);
250
vst4_lane_u8(dst + 1 * stride, v, 1);
251
vst4_lane_u8(dst + 2 * stride, v, 2);
252
vst4_lane_u8(dst + 3 * stride, v, 3);
253
vst4_lane_u8(dst + 4 * stride, v, 4);
254
vst4_lane_u8(dst + 5 * stride, v, 5);
255
vst4_lane_u8(dst + 6 * stride, v, 6);
256
vst4_lane_u8(dst + 7 * stride, v, 7);
257
}
258
259
static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260
const uint8x16_t q0, const uint8x16_t q1,
261
uint8_t* const dst, int stride) {
262
uint8x8x4_t lo, hi;
263
INIT_VECTOR4(lo,
264
vget_low_u8(p1), vget_low_u8(p0),
265
vget_low_u8(q0), vget_low_u8(q1));
266
INIT_VECTOR4(hi,
267
vget_high_u8(p1), vget_high_u8(p0),
268
vget_high_u8(q0), vget_high_u8(q1));
269
Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270
Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271
}
272
#endif // !WORK_AROUND_GCC
273
274
static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275
uint8_t* const dst, int stride) {
276
vst1q_u8(dst - stride, p0);
277
vst1q_u8(dst, q0);
278
}
279
280
static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281
const uint8x16_t q0, const uint8x16_t q1,
282
uint8_t* const dst, int stride) {
283
Store16x2_NEON(p1, p0, dst - stride, stride);
284
Store16x2_NEON(q0, q1, dst + stride, stride);
285
}
286
287
static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288
const uint8x16_t q0,
289
uint8_t* const u, uint8_t* const v,
290
int stride) {
291
// p0 and q0 contain the u+v samples packed in low/high halves.
292
vst1_u8(u - stride, vget_low_u8(p0));
293
vst1_u8(u, vget_low_u8(q0));
294
vst1_u8(v - stride, vget_high_u8(p0));
295
vst1_u8(v, vget_high_u8(q0));
296
}
297
298
static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299
const uint8x16_t p0,
300
const uint8x16_t q0,
301
const uint8x16_t q1,
302
uint8_t* const u, uint8_t* const v,
303
int stride) {
304
// The p1...q1 registers contain the u+v samples packed in low/high halves.
305
Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306
Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307
}
308
309
#if !defined(WORK_AROUND_GCC)
310
311
#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
312
vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
313
vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
314
(DST) += stride; \
315
} while (0)
316
317
static WEBP_INLINE void Store6x8x2_NEON(
318
const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319
const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320
uint8_t* u, uint8_t* v, int stride) {
321
uint8x8x3_t u0, u1, v0, v1;
322
INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323
INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324
INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325
INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326
STORE6_LANE(u, u0, u1, 0);
327
STORE6_LANE(u, u0, u1, 1);
328
STORE6_LANE(u, u0, u1, 2);
329
STORE6_LANE(u, u0, u1, 3);
330
STORE6_LANE(u, u0, u1, 4);
331
STORE6_LANE(u, u0, u1, 5);
332
STORE6_LANE(u, u0, u1, 6);
333
STORE6_LANE(u, u0, u1, 7);
334
STORE6_LANE(v, v0, v1, 0);
335
STORE6_LANE(v, v0, v1, 1);
336
STORE6_LANE(v, v0, v1, 2);
337
STORE6_LANE(v, v0, v1, 3);
338
STORE6_LANE(v, v0, v1, 4);
339
STORE6_LANE(v, v0, v1, 5);
340
STORE6_LANE(v, v0, v1, 6);
341
STORE6_LANE(v, v0, v1, 7);
342
}
343
#undef STORE6_LANE
344
345
static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346
const uint8x16_t p0,
347
const uint8x16_t q0,
348
const uint8x16_t q1,
349
uint8_t* const u, uint8_t* const v,
350
int stride) {
351
uint8x8x4_t u0, v0;
352
INIT_VECTOR4(u0,
353
vget_low_u8(p1), vget_low_u8(p0),
354
vget_low_u8(q0), vget_low_u8(q1));
355
INIT_VECTOR4(v0,
356
vget_high_u8(p1), vget_high_u8(p0),
357
vget_high_u8(q0), vget_high_u8(q1));
358
vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359
vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360
vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361
vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362
vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363
vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364
vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365
vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366
vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367
vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368
vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369
vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370
vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371
vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372
vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373
vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374
}
375
376
#endif // !WORK_AROUND_GCC
377
378
// Zero extend 'v' to an int16x8_t.
379
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380
return vreinterpretq_s16_u16(vmovl_u8(v));
381
}
382
383
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384
// to the corresponding rows of 'dst'.
385
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386
const int16x8_t dst01,
387
const int16x8_t dst23) {
388
// Unsigned saturate to 8b.
389
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391
392
// Store the results.
393
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397
}
398
399
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400
const int16x8_t row23,
401
uint8_t* const dst) {
402
uint32x2_t dst01 = vdup_n_u32(0);
403
uint32x2_t dst23 = vdup_n_u32(0);
404
405
// Load the source pixels.
406
dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407
dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408
dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409
dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410
411
{
412
// Convert to 16b.
413
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415
416
// Descale with rounding.
417
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419
// Add the inverse transform.
420
SaturateAndStore4x4_NEON(dst, out01, out23);
421
}
422
}
423
424
//-----------------------------------------------------------------------------
425
// Simple In-loop filtering (Paragraph 15.2)
426
427
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
428
const uint8x16_t q0, const uint8x16_t q1,
429
int thresh) {
430
const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
431
const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
432
const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
433
const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
434
const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
435
const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
436
const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
437
return mask;
438
}
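// Illustrative scalar sketch (not part of libwebp): the per-pixel test that
// NeedsFilter_NEON evaluates on all 16 lanes at once. The NEON version uses
// saturating adds to keep the sums within a byte.
static WEBP_INLINE int NeedsFilter_Scalar(int p1, int p0, int q0, int q1,
                                          int thresh) {
  const int a_p0_q0 = (p0 > q0) ? p0 - q0 : q0 - p0;  // abs(p0 - q0)
  const int a_p1_q1 = (p1 > q1) ? p1 - q1 : q1 - p1;  // abs(p1 - q1)
  return 2 * a_p0_q0 + (a_p1_q1 >> 1) <= thresh;
}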
439
440
static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441
const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442
return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443
}
444
445
static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446
const int8x16_t sign_bit = vdupq_n_s8(0x80);
447
return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448
}
449
450
static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451
const int8x16_t q0, const int8x16_t q1) {
452
const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
453
const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
454
const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
455
const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
456
const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
457
return s3;
458
}
459
460
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
461
const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
462
const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
463
const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
464
return s2;
465
}
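// Illustrative scalar sketch (not part of libwebp): on the sign-flipped
// samples the base filter amount is 3 * (q0 - p0) + (p1 - q1). The NEON
// versions above accumulate it with vqaddq_s8, saturating every partial sum
// to [-128, 127]; this sketch only clamps the final value.
static WEBP_INLINE int GetBaseDelta_Scalar(int p1, int p0, int q0, int q1) {
  const int a = 3 * (q0 - p0) + (p1 - q1);
  return (a < -128) ? -128 : (a > 127) ? 127 : a;
}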
466
467
//------------------------------------------------------------------------------
468
469
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
470
const int8x16_t delta,
471
int8x16_t* const op0,
472
int8x16_t* const oq0) {
473
const int8x16_t kCst3 = vdupq_n_s8(0x03);
474
const int8x16_t kCst4 = vdupq_n_s8(0x04);
475
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
476
const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
477
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
478
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
479
*op0 = vqaddq_s8(p0s, delta3);
480
*oq0 = vqsubq_s8(q0s, delta4);
481
}
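// Illustrative scalar sketch (not part of libwebp) of the simple-filter
// update applied above: the two pixels nearest the edge move towards each
// other by roughly delta / 8, with the +3/+4 biases implementing the VP8
// rounding. The int8 saturation of each NEON step is omitted here.
static WEBP_INLINE void ApplyFilter2_Scalar(int* const p0, int* const q0,
                                            int delta) {
  *p0 += (delta + 3) >> 3;  // p0s + delta3
  *q0 -= (delta + 4) >> 3;  // q0s - delta4
}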
482
483
#if defined(WEBP_USE_INTRINSICS)
484
485
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
486
const int8x16_t delta,
487
uint8x16_t* const op0, uint8x16_t* const oq0) {
488
const int8x16_t kCst3 = vdupq_n_s8(0x03);
489
const int8x16_t kCst4 = vdupq_n_s8(0x04);
490
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
491
const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
492
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
493
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
494
const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
495
const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
496
*op0 = FlipSignBack_NEON(sp0);
497
*oq0 = FlipSignBack_NEON(sq0);
498
}
499
500
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
501
const uint8x16_t q0, const uint8x16_t q1,
502
const uint8x16_t mask,
503
uint8x16_t* const op0, uint8x16_t* const oq0) {
504
const int8x16_t p1s = FlipSign_NEON(p1);
505
const int8x16_t p0s = FlipSign_NEON(p0);
506
const int8x16_t q0s = FlipSign_NEON(q0);
507
const int8x16_t q1s = FlipSign_NEON(q1);
508
const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
509
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
510
ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
511
}
512
513
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514
uint8x16_t p1, p0, q0, q1, op0, oq0;
515
Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516
{
517
const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518
DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519
}
520
Store16x2_NEON(op0, oq0, p, stride);
521
}
522
523
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524
uint8x16_t p1, p0, q0, q1, oq0, op0;
525
Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526
{
527
const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528
DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529
}
530
Store2x16_NEON(op0, oq0, p, stride);
531
}
532
533
#else
534
535
// Load/Store vertical edge
536
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
537
"vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
538
"vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
539
"vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
540
"vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
541
"vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
542
"vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
543
"vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
544
"vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545
546
#define STORE8x2(c1, c2, p, stride) \
547
"vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
548
"vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
549
"vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
550
"vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
551
"vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
552
"vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
553
"vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
554
"vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555
556
#define QRegs "q0", "q1", "q2", "q3", \
557
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558
559
#define FLIP_SIGN_BIT2(a, b, s) \
560
"veor " #a "," #a "," #s " \n" \
561
"veor " #b "," #b "," #s " \n" \
562
563
#define FLIP_SIGN_BIT4(a, b, c, d, s) \
564
FLIP_SIGN_BIT2(a, b, s) \
565
FLIP_SIGN_BIT2(c, d, s) \
566
567
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
568
"vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
569
"vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
570
"vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
571
"vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
572
"vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
573
"vdup.8 q14, " #thresh " \n" \
574
"vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */
575
576
#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
577
"vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
578
"vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
579
"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \
580
"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \
581
"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */
582
583
#define DO_SIMPLE_FILTER(p0, q0, fl) \
584
"vmov.i8 q15, #0x03 \n" \
585
"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
586
"vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
587
"vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
588
\
589
"vmov.i8 q15, #0x04 \n" \
590
"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \
591
"vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
592
"vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
593
594
// Applies filter on 2 pixels (p0 and q0)
595
#define DO_FILTER2(p1, p0, q0, q1, thresh) \
596
NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
597
"vmov.i8 q10, #0x80 \n" /* sign bit */ \
598
FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
599
GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
600
"vand q9, q9, q11 \n" /* apply filter mask */ \
601
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
602
FLIP_SIGN_BIT2(p0, q0, q10)
603
604
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
605
__asm__ volatile (
606
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
607
608
"vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
609
"vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
610
"vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
611
"vld1.u8 {q12}, [%[p]] \n" // q1
612
613
DO_FILTER2(q1, q2, q3, q12, %[thresh])
614
615
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
616
617
"vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
618
"vst1.u8 {q3}, [%[p]] \n" // store oq0
619
: [p] "+r"(p)
620
: [stride] "r"(stride), [thresh] "r"(thresh)
621
: "memory", QRegs
622
);
623
}
624
625
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
626
__asm__ volatile (
627
"sub r4, %[p], #2 \n" // base1 = p - 2
628
"lsl r6, %[stride], #1 \n" // r6 = 2 * stride
629
"add r5, r4, %[stride] \n" // base2 = base1 + stride
630
631
LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
632
LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
633
"vswp d3, d24 \n" // p1:q1 p0:q3
634
"vswp d5, d26 \n" // q0:q2 q1:q4
635
"vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
636
637
DO_FILTER2(q1, q2, q12, q13, %[thresh])
638
639
"sub %[p], %[p], #1 \n" // p - 1
640
641
"vswp d5, d24 \n"
642
STORE8x2(d4, d5, [%[p]], %[stride])
643
STORE8x2(d24, d25, [%[p]], %[stride])
644
645
: [p] "+r"(p)
646
: [stride] "r"(stride), [thresh] "r"(thresh)
647
: "memory", "r4", "r5", "r6", QRegs
648
);
649
}
650
651
#undef LOAD8x4
652
#undef STORE8x2
653
654
#endif // WEBP_USE_INTRINSICS
655
656
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
657
uint32_t k;
658
for (k = 3; k != 0; --k) {
659
p += 4 * stride;
660
SimpleVFilter16_NEON(p, stride, thresh);
661
}
662
}
663
664
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
665
uint32_t k;
666
for (k = 3; k != 0; --k) {
667
p += 4;
668
SimpleHFilter16_NEON(p, stride, thresh);
669
}
670
}
671
672
//------------------------------------------------------------------------------
673
// Complex In-loop filtering (Paragraph 15.3)
674
675
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
676
const uint8x16_t q0, const uint8x16_t q1,
677
int hev_thresh) {
678
const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
679
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
680
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
681
const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
682
const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
683
return mask;
684
}
685
686
static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687
const uint8x16_t p1, const uint8x16_t p0,
688
const uint8x16_t q0, const uint8x16_t q1,
689
const uint8x16_t q2, const uint8x16_t q3,
690
int ithresh, int thresh) {
691
const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692
const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
693
const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
694
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
695
const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
696
const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
697
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
698
const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699
const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700
const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701
const uint8x16_t max12 = vmaxq_u8(max1, max2);
702
const uint8x16_t max123 = vmaxq_u8(max12, max3);
703
const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704
const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705
const uint8x16_t mask = vandq_u8(mask1, mask2);
706
return mask;
707
}
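// Illustrative scalar sketch (not part of libwebp): the complex-filter mask
// requires every neighbouring difference on each side of the edge to stay
// within 'ithresh', in addition to the simple-filter test against 'thresh'.
#define ABS_DIFF_SKETCH(a, b) ((a) > (b) ? (a) - (b) : (b) - (a))
static WEBP_INLINE int NeedsFilter2_Scalar(int p3, int p2, int p1, int p0,
                                           int q0, int q1, int q2, int q3,
                                           int ithresh, int thresh) {
  return ABS_DIFF_SKETCH(p3, p2) <= ithresh &&
         ABS_DIFF_SKETCH(p2, p1) <= ithresh &&
         ABS_DIFF_SKETCH(p1, p0) <= ithresh &&
         ABS_DIFF_SKETCH(q3, q2) <= ithresh &&
         ABS_DIFF_SKETCH(q2, q1) <= ithresh &&
         ABS_DIFF_SKETCH(q1, q0) <= ithresh &&
         2 * ABS_DIFF_SKETCH(p0, q0) + (ABS_DIFF_SKETCH(p1, q1) >> 1) <= thresh;
}
#undef ABS_DIFF_SKETCH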
708
709
// 4-points filter
710
711
static void ApplyFilter4_NEON(
712
const int8x16_t p1, const int8x16_t p0,
713
const int8x16_t q0, const int8x16_t q1,
714
const int8x16_t delta0,
715
uint8x16_t* const op1, uint8x16_t* const op0,
716
uint8x16_t* const oq0, uint8x16_t* const oq1) {
717
const int8x16_t kCst3 = vdupq_n_s8(0x03);
718
const int8x16_t kCst4 = vdupq_n_s8(0x04);
719
const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
720
const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
721
const int8x16_t a1 = vshrq_n_s8(delta1, 3);
722
const int8x16_t a2 = vshrq_n_s8(delta2, 3);
723
const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
724
*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)
725
*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
726
*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)
727
*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)
728
}
729
730
static void DoFilter4_NEON(
731
const uint8x16_t p1, const uint8x16_t p0,
732
const uint8x16_t q0, const uint8x16_t q1,
733
const uint8x16_t mask, const uint8x16_t hev_mask,
734
uint8x16_t* const op1, uint8x16_t* const op0,
735
uint8x16_t* const oq0, uint8x16_t* const oq1) {
736
// This is a fused version of DoFilter2() calling ApplyFilter2 directly
737
const int8x16_t p1s = FlipSign_NEON(p1);
738
int8x16_t p0s = FlipSign_NEON(p0);
739
int8x16_t q0s = FlipSign_NEON(q0);
740
const int8x16_t q1s = FlipSign_NEON(q1);
741
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
742
743
// do_filter2 part (simple loopfilter on pixels with hev)
744
{
745
const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
746
const int8x16_t simple_lf_delta =
747
vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
748
ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
749
}
750
751
// do_filter4 part (complex loopfilter on pixels without hev)
752
{
753
const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
754
// we use: (mask & hev_mask) ^ mask = mask & !hev_mask
755
const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
756
const int8x16_t complex_lf_delta =
757
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
758
ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
759
}
760
}
761
762
// 6-points filter
763
764
static void ApplyFilter6_NEON(
765
const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
766
const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
767
const int8x16_t delta,
768
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
769
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
770
// We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
771
// Turns out, there's a common sub-expression S=9 * a - 1 that can be used
772
// with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
773
// X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
774
const int8x8_t delta_lo = vget_low_s8(delta);
775
const int8x8_t delta_hi = vget_high_s8(delta);
776
const int8x8_t kCst9 = vdup_n_s8(9);
777
const int16x8_t kCstm1 = vdupq_n_s16(-1);
778
const int8x8_t kCst18 = vdup_n_s8(18);
779
const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1
780
const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
781
const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a
782
const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
783
const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7
784
const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
785
const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6
786
const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
787
const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7
788
const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
789
const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
790
const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
791
const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
792
793
*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)
794
*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
795
*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)
796
*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)
797
*oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)
798
*op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)
799
}
800
801
static void DoFilter6_NEON(
802
const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
803
const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
804
const uint8x16_t mask, const uint8x16_t hev_mask,
805
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
806
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
807
// This is a fused version of DoFilter2() calling ApplyFilter2 directly
808
const int8x16_t p2s = FlipSign_NEON(p2);
809
const int8x16_t p1s = FlipSign_NEON(p1);
810
int8x16_t p0s = FlipSign_NEON(p0);
811
int8x16_t q0s = FlipSign_NEON(q0);
812
const int8x16_t q1s = FlipSign_NEON(q1);
813
const int8x16_t q2s = FlipSign_NEON(q2);
814
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
815
const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
816
817
// do_filter2 part (simple loopfilter on pixels with hev)
818
{
819
const int8x16_t simple_lf_delta =
820
vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
821
ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
822
}
823
824
// do_filter6 part (complex loopfilter on pixels without hev)
825
{
826
// we use: (mask & hev_mask) ^ mask = mask & !hev_mask
827
const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
828
const int8x16_t complex_lf_delta =
829
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
830
ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
831
op2, op1, op0, oq0, oq1, oq2);
832
}
833
}
834
835
// on macroblock edges
836
837
static void VFilter16_NEON(uint8_t* p, int stride,
838
int thresh, int ithresh, int hev_thresh) {
839
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840
Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841
{
842
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843
ithresh, thresh);
844
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847
&op2, &op1, &op0, &oq0, &oq1, &oq2);
848
Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849
Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850
Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851
}
852
}
853
854
static void HFilter16_NEON(uint8_t* p, int stride,
855
int thresh, int ithresh, int hev_thresh) {
856
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857
Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858
{
859
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860
ithresh, thresh);
861
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864
&op2, &op1, &op0, &oq0, &oq1, &oq2);
865
Store2x16_NEON(op2, op1, p - 2, stride);
866
Store2x16_NEON(op0, oq0, p + 0, stride);
867
Store2x16_NEON(oq1, oq2, p + 2, stride);
868
}
869
}
870
871
// on three inner edges
872
static void VFilter16i_NEON(uint8_t* p, int stride,
873
int thresh, int ithresh, int hev_thresh) {
874
uint32_t k;
875
uint8x16_t p3, p2, p1, p0;
876
Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
877
for (k = 3; k != 0; --k) {
878
uint8x16_t q0, q1, q2, q3;
879
p += 4 * stride;
880
Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
881
{
882
const uint8x16_t mask =
883
NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885
// p3 and p2 are not just temporary variables here: they will be
886
// re-used for the next span. And q2/q3 will become p1/p0 accordingly.
887
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888
Store16x4_NEON(p1, p0, p3, p2, p, stride);
889
p1 = q2;
890
p0 = q3;
891
}
892
}
893
}
894
895
#if !defined(WORK_AROUND_GCC)
896
static void HFilter16i_NEON(uint8_t* p, int stride,
897
int thresh, int ithresh, int hev_thresh) {
898
uint32_t k;
899
uint8x16_t p3, p2, p1, p0;
900
Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901
for (k = 3; k != 0; --k) {
902
uint8x16_t q0, q1, q2, q3;
903
p += 4;
904
Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905
{
906
const uint8x16_t mask =
907
NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910
Store4x16_NEON(p1, p0, p3, p2, p, stride);
911
p1 = q2;
912
p0 = q3;
913
}
914
}
915
}
916
#endif // !WORK_AROUND_GCC
917
918
// 8-pixels wide variant, for chroma filtering
919
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
920
int stride, int thresh, int ithresh, int hev_thresh) {
921
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923
{
924
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925
ithresh, thresh);
926
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929
&op2, &op1, &op0, &oq0, &oq1, &oq2);
930
Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
931
Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
932
Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
933
}
934
}
935
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
936
int stride,
937
int thresh, int ithresh, int hev_thresh) {
938
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
939
u += 4 * stride;
940
v += 4 * stride;
941
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
942
{
943
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
944
ithresh, thresh);
945
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
946
uint8x16_t op1, op0, oq0, oq1;
947
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
948
Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
949
}
950
}
951
952
#if !defined(WORK_AROUND_GCC)
953
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
954
int stride, int thresh, int ithresh, int hev_thresh) {
955
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
956
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
957
{
958
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
959
ithresh, thresh);
960
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
961
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
962
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
963
&op2, &op1, &op0, &oq0, &oq1, &oq2);
964
Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
965
}
966
}
967
968
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
969
int stride,
970
int thresh, int ithresh, int hev_thresh) {
971
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
972
u += 4;
973
v += 4;
974
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
975
{
976
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
977
ithresh, thresh);
978
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
979
uint8x16_t op1, op0, oq0, oq1;
980
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
981
Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
982
}
983
}
984
#endif // !WORK_AROUND_GCC
985
986
//-----------------------------------------------------------------------------
987
// Inverse transforms (Paragraph 14.4)
988
989
// Technically these are unsigned but vqdmulh is only available in signed.
990
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
991
// changing the >> 16 to >> 15 and requiring an additional >> 1.
992
// We use this to our advantage with kC2. The canonical value is 35468.
993
// However, the high bit is set so treating it as signed will give incorrect
994
// results. We avoid this by down shifting by 1 here to clear the highest bit.
995
// Combined with the doubling effect of vqdmulh we get >> 16.
996
// This can not be applied to kC1 because the lowest bit is set. Down shifting
997
// the constant would reduce precision.
998
999
// libwebp uses a trick to avoid some extra addition that libvpx does.
1000
// Instead of:
1001
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1002
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1003
// same issue with kC1 and vqdmulh that we work around by down shifting kC2.
1004
1005
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
1006
static const int16_t kC2 =
1007
WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.
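// Illustrative scalar model (not part of libwebp) of the trick above:
// vqdmulh.s16 returns (2 * x * c) >> 16 (saturation omitted here). Passing
// the pre-halved kC2 therefore yields exactly (x * 35468) >> 16, since the
// canonical constant is even and loses nothing when divided by 2.
static WEBP_INLINE int MulC2_Model(int x) {
  return (2 * x * kC2) >> 16;  // == (x * WEBP_TRANSFORM_AC3_C2) >> 16
}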
1008
1009
#if defined(WEBP_USE_INTRINSICS)
1010
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1011
const int16x8_t in1,
1012
int16x8x2_t* const out) {
1013
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
1014
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
1015
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
1016
// b0 d0 b1 d1 b2 d2 ...
1017
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1018
}
1019
1020
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1021
// {rows} = in0 | in4
1022
// in8 | in12
1023
// B1 = in4 | in12
1024
const int16x8_t B1 =
1025
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1026
// C0 = kC1 * in4 | kC1 * in12
1027
// C1 = kC2 * in4 | kC2 * in12
1028
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1029
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1030
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1031
vget_low_s16(rows->val[1])); // in0 + in8
1032
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1033
vget_low_s16(rows->val[1])); // in0 - in8
1034
// c = kC2 * in4 - kC1 * in12
1035
// d = kC1 * in4 + kC2 * in12
1036
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1037
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1038
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
1039
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
1040
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
1041
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
1042
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1043
Transpose8x2_NEON(E0, E1, rows);
1044
}
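// Illustrative scalar sketch (not part of libwebp) of the 1-D butterfly that
// TransformPass_NEON applies to each column (c0, c1, c2, c3) of the 4x4
// block, assuming the WEBP_TRANSFORM_AC3_MUL1/MUL2 helpers used elsewhere in
// this file stand for the kC1/kC2 multiplies.
static WEBP_INLINE void TransformColumn_Scalar(const int col[4], int out[4]) {
  const int a = col[0] + col[2];
  const int b = col[0] - col[2];
  const int c = WEBP_TRANSFORM_AC3_MUL2(col[1]) -
                WEBP_TRANSFORM_AC3_MUL1(col[3]);
  const int d = WEBP_TRANSFORM_AC3_MUL1(col[1]) +
                WEBP_TRANSFORM_AC3_MUL2(col[3]);
  out[0] = a + d;
  out[1] = b + c;
  out[2] = b - c;
  out[3] = a - d;
}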
1045
1046
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
1047
uint8_t* WEBP_RESTRICT dst) {
1048
int16x8x2_t rows;
1049
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1050
TransformPass_NEON(&rows);
1051
TransformPass_NEON(&rows);
1052
Add4x4_NEON(rows.val[0], rows.val[1], dst);
1053
}
1054
1055
#else
1056
1057
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
1058
uint8_t* WEBP_RESTRICT dst) {
1059
const int kBPS = BPS;
1060
// kC1, kC2. Padded because vld1.16 loads 8 bytes
1061
const int16_t constants[4] = { kC1, kC2, 0, 0 };
1062
/* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1063
__asm__ volatile (
1064
"vld1.16 {q1, q2}, [%[in]] \n"
1065
"vld1.16 {d0}, [%[constants]] \n"
1066
1067
/* d2: in[0]
1068
* d3: in[8]
1069
* d4: in[4]
1070
* d5: in[12]
1071
*/
1072
"vswp d3, d4 \n"
1073
1074
/* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1075
* q9 = {in[4], in[12]} * kC2 >> 16
1076
*/
1077
"vqdmulh.s16 q8, q2, d0[0] \n"
1078
"vqdmulh.s16 q9, q2, d0[1] \n"
1079
1080
/* d22 = a = in[0] + in[8]
1081
* d23 = b = in[0] - in[8]
1082
*/
1083
"vqadd.s16 d22, d2, d3 \n"
1084
"vqsub.s16 d23, d2, d3 \n"
1085
1086
/* The multiplication should be x * kC1 >> 16
1087
* However, with vqdmulh we get x * kC1 * 2 >> 16
1088
* (multiply, double, return high half)
1089
* We avoided this in kC2 by pre-shifting the constant.
1090
* q8 = in[4]/[12] * kC1 >> 16
1091
*/
1092
"vshr.s16 q8, q8, #1 \n"
1093
1094
/* Add {in[4], in[12]} back after the multiplication. This is handled by
1095
* adding 1 << 16 to kC1 in the libwebp C code.
1096
*/
1097
"vqadd.s16 q8, q2, q8 \n"
1098
1099
/* d20 = c = in[4]*kC2 - in[12]*kC1
1100
* d21 = d = in[4]*kC1 + in[12]*kC2
1101
*/
1102
"vqsub.s16 d20, d18, d17 \n"
1103
"vqadd.s16 d21, d19, d16 \n"
1104
1105
/* d2 = tmp[0] = a + d
1106
* d3 = tmp[1] = b + c
1107
* d4 = tmp[2] = b - c
1108
* d5 = tmp[3] = a - d
1109
*/
1110
"vqadd.s16 d2, d22, d21 \n"
1111
"vqadd.s16 d3, d23, d20 \n"
1112
"vqsub.s16 d4, d23, d20 \n"
1113
"vqsub.s16 d5, d22, d21 \n"
1114
1115
"vzip.16 q1, q2 \n"
1116
"vzip.16 q1, q2 \n"
1117
1118
"vswp d3, d4 \n"
1119
1120
/* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1121
* q9 = {tmp[4], tmp[12]} * kC2 >> 16
1122
*/
1123
"vqdmulh.s16 q8, q2, d0[0] \n"
1124
"vqdmulh.s16 q9, q2, d0[1] \n"
1125
1126
/* d22 = a = tmp[0] + tmp[8]
1127
* d23 = b = tmp[0] - tmp[8]
1128
*/
1129
"vqadd.s16 d22, d2, d3 \n"
1130
"vqsub.s16 d23, d2, d3 \n"
1131
1132
/* See long winded explanations prior */
1133
"vshr.s16 q8, q8, #1 \n"
1134
"vqadd.s16 q8, q2, q8 \n"
1135
1136
/* d20 = c = in[4]*kC2 - in[12]*kC1
1137
* d21 = d = in[4]*kC1 + in[12]*kC2
1138
*/
1139
"vqsub.s16 d20, d18, d17 \n"
1140
"vqadd.s16 d21, d19, d16 \n"
1141
1142
/* d2 = tmp[0] = a + d
1143
* d3 = tmp[1] = b + c
1144
* d4 = tmp[2] = b - c
1145
* d5 = tmp[3] = a - d
1146
*/
1147
"vqadd.s16 d2, d22, d21 \n"
1148
"vqadd.s16 d3, d23, d20 \n"
1149
"vqsub.s16 d4, d23, d20 \n"
1150
"vqsub.s16 d5, d22, d21 \n"
1151
1152
"vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1153
"vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1154
"vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1155
"vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1156
1157
"sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1158
1159
/* (val) + 4 >> 3 */
1160
"vrshr.s16 d2, d2, #3 \n"
1161
"vrshr.s16 d3, d3, #3 \n"
1162
"vrshr.s16 d4, d4, #3 \n"
1163
"vrshr.s16 d5, d5, #3 \n"
1164
1165
"vzip.16 q1, q2 \n"
1166
"vzip.16 q1, q2 \n"
1167
1168
/* Must accumulate before saturating */
1169
"vmovl.u8 q8, d6 \n"
1170
"vmovl.u8 q9, d7 \n"
1171
1172
"vqadd.s16 q1, q1, q8 \n"
1173
"vqadd.s16 q2, q2, q9 \n"
1174
1175
"vqmovun.s16 d0, q1 \n"
1176
"vqmovun.s16 d1, q2 \n"
1177
1178
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1179
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1180
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1181
"vst1.32 d1[1], [%[dst]] \n"
1182
1183
: [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1184
: [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1185
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1186
);
1187
}
1188
1189
#endif // WEBP_USE_INTRINSICS
1190
1191
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
1192
uint8_t* WEBP_RESTRICT dst, int do_two) {
1193
TransformOne_NEON(in, dst);
1194
if (do_two) {
1195
TransformOne_NEON(in + 16, dst + 4);
1196
}
1197
}
1198
1199
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
1200
uint8_t* WEBP_RESTRICT dst) {
1201
const int16x8_t DC = vdupq_n_s16(in[0]);
1202
Add4x4_NEON(DC, DC, dst);
1203
}
1204
1205
//------------------------------------------------------------------------------
1206
1207
#define STORE_WHT(dst, col, rows) do { \
1208
*dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1209
*dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1210
*dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1211
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1212
} while (0)
1213
1214
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
1215
int16_t* WEBP_RESTRICT out) {
1216
int32x4x4_t tmp;
1217
1218
{
1219
// Load the source.
1220
const int16x4_t in00_03 = vld1_s16(in + 0);
1221
const int16x4_t in04_07 = vld1_s16(in + 4);
1222
const int16x4_t in08_11 = vld1_s16(in + 8);
1223
const int16x4_t in12_15 = vld1_s16(in + 12);
1224
const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1225
const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1226
const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1227
const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1228
tmp.val[0] = vaddq_s32(a0, a1);
1229
tmp.val[1] = vaddq_s32(a3, a2);
1230
tmp.val[2] = vsubq_s32(a0, a1);
1231
tmp.val[3] = vsubq_s32(a3, a2);
1232
// Arrange the temporary results column-wise.
1233
tmp = Transpose4x4_NEON(tmp);
1234
}
1235
1236
{
1237
const int32x4_t kCst3 = vdupq_n_s32(3);
1238
const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
1239
const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1240
const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1241
const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1242
const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1243
1244
tmp.val[0] = vaddq_s32(a0, a1);
1245
tmp.val[1] = vaddq_s32(a3, a2);
1246
tmp.val[2] = vsubq_s32(a0, a1);
1247
tmp.val[3] = vsubq_s32(a3, a2);
1248
1249
// right shift the results by 3.
1250
tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1251
tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1252
tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1253
tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1254
1255
STORE_WHT(out, 0, tmp);
1256
STORE_WHT(out, 1, tmp);
1257
STORE_WHT(out, 2, tmp);
1258
STORE_WHT(out, 3, tmp);
1259
}
1260
}
1261
1262
#undef STORE_WHT
1263
1264
//------------------------------------------------------------------------------
1265
1266
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
1267
uint8_t* WEBP_RESTRICT dst) {
1268
const int16x4_t A = vld1_dup_s16(in);
1269
const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
1270
const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
1271
const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
1272
const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
1273
const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
1274
(uint64_t)( c1 & 0xffff) << 16 |
1275
(uint64_t)(-c1 & 0xffff) << 32 |
1276
(uint64_t)(-d1 & 0xffff) << 48;
1277
const int16x4_t CD = vcreate_s16(cd);
1278
const int16x4_t B = vqadd_s16(A, CD);
1279
const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1280
const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1281
Add4x4_NEON(m0_m1, m2_m3, dst);
1282
}
1283
1284
//------------------------------------------------------------------------------
1285
// 4x4
1286
1287
static void DC4_NEON(uint8_t* dst) { // DC
1288
const uint8x8_t A = vld1_u8(dst - BPS); // top row
1289
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1290
const uint16x4_t p1 = vpadd_u16(p0, p0);
1291
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1292
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1293
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1294
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1295
const uint16x8_t s0 = vaddl_u8(L0, L1);
1296
const uint16x8_t s1 = vaddl_u8(L2, L3);
1297
const uint16x8_t s01 = vaddq_u16(s0, s1);
1298
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1299
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
1300
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1301
int i;
1302
for (i = 0; i < 4; ++i) {
1303
vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1304
}
1305
}
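// Illustrative scalar sketch (not part of libwebp): the value DC4_NEON
// replicates over the 4x4 block is the rounded average of the four top and
// four left neighbours.
static WEBP_INLINE int DC4_Scalar(const uint8_t* dst) {
  int sum = 4;  // rounder for the final >> 3
  int i;
  for (i = 0; i < 4; ++i) sum += dst[i - BPS] + dst[i * BPS - 1];
  return sum >> 3;
}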
1306
1307
// TrueMotion (4x4 + 8x8)
1308
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1309
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1310
const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
1311
const uint16x8_t d = vsubl_u8(T, TL); // A[c] - A[-1]
1312
int y;
1313
for (y = 0; y < size; y += 4) {
1314
// left edge
1315
const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
1316
const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
1317
const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
1318
const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
1319
// L[r] + A[c] - A[-1]
1320
const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
1321
const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
1322
const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
1323
const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
1324
// Saturate and store the result.
1325
const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1326
const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1327
const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1328
const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1329
if (size == 4) {
1330
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1331
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1332
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1333
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1334
} else {
1335
vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1336
vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1337
vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1338
vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1339
}
1340
dst += 4 * BPS;
1341
}
1342
}
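// Illustrative scalar sketch (not part of libwebp): TrueMotion predicts each
// pixel as clip(left[y] + top[x] - top_left), which is what the widening
// subtract/add and the saturating narrow above compute four rows at a time.
static WEBP_INLINE void TrueMotion_Scalar(uint8_t* dst, int size) {
  const int top_left = dst[-BPS - 1];
  int x, y;
  for (y = 0; y < size; ++y) {
    const int left = dst[y * BPS - 1];
    for (x = 0; x < size; ++x) {
      const int v = left + dst[x - BPS] - top_left;
      dst[y * BPS + x] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}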
1343
1344
static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
1345
1346
static void VE4_NEON(uint8_t* dst) { // vertical
1347
// NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1348
const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
1349
const uint64x1_t A1 = vshr_n_u64(A0, 8);
1350
const uint64x1_t A2 = vshr_n_u64(A0, 16);
1351
const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1352
const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1353
const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1354
const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1355
const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1356
int i;
1357
for (i = 0; i < 4; ++i) {
1358
vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1359
}
1360
}
1361
1362
static void RD4_NEON(uint8_t* dst) { // Down-right
1363
const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1364
const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1365
const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1366
const uint32_t I = dst[-1 + 0 * BPS];
1367
const uint32_t J = dst[-1 + 1 * BPS];
1368
const uint32_t K = dst[-1 + 2 * BPS];
1369
const uint32_t L = dst[-1 + 3 * BPS];
1370
const uint64x1_t LKJI____ =
1371
vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
1372
const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1373
const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1374
const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1375
const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1376
const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1377
const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1378
const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1379
const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1380
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1381
const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1382
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1383
const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1384
const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1385
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1386
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1387
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1388
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1389
}
1390
1391
static void LD4_NEON(uint8_t* dst) {  // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_NEON(uint8_t* dst) {  // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  int j;
  for (j = 0; j < 8; ++j) {
    vst1_u8(dst + j * BPS, top);
  }
}

static void HE8uv_NEON(uint8_t* dst) {  // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const uint8x8_t left = vld1_dup_u8(dst - 1);
    vst1_u8(dst, left);
    dst += BPS;
  }
}

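// DC prediction for the 8x8 chroma blocks: sum the 8 top and/or 8 left
// neighbour pixels, take the rounded average with vrshrn_n_u16 (shift 4 when
// both edges are present, 3 when only one is), and fall back to 0x80 when
// neither edge is available.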
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
#if WEBP_AARCH64
    const uint16_t p2 = vaddlv_u8(A);
    sum_top = vdupq_n_u16(p2);
#else
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
#endif
  }

  if (do_left) {
    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
    const uint16x8_t s0 = vaddl_u8(L0, L1);
    const uint16x8_t s1 = vaddl_u8(L2, L3);
    const uint16x8_t s2 = vaddl_u8(L4, L5);
    const uint16x8_t s3 = vaddl_u8(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }

static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

static void VE16_NEON(uint8_t* dst) {  // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16_NEON(uint8_t* dst) {  // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

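// Same DC scheme as DC8_NEON() but over 16 top and/or 16 left pixels, so the
// rounded-average shifts become 5 (both edges) and 4 (single edge). On AArch64
// the across-vector add vaddlvq_u8 replaces the cascading pairwise sums.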
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
#if WEBP_AARCH64
    const uint16_t p3 = vaddlvq_u8(A);
    sum_top = vdupq_n_u16(p3);
#else
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
#endif
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
      const uint16x8_t s0 = vaddl_u8(L0, L1);
      const uint16x8_t s1 = vaddl_u8(L2, L3);
      const uint16x8_t s2 = vaddl_u8(L4, L5);
      const uint16x8_t s3 = vaddl_u8(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }

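// 16x16 TrueMotion: the top-minus-top-left differences are kept as two
// int16x8 halves (d_lo/d_hi); each of the four rows handled per iteration
// adds its left pixel, saturates back to 8 bits and stores a full 16-byte row.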
static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
    // L[r] + A[c] - A[-1]
    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

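// Installs the NEON implementations above into the decoder's function-pointer
// tables (transforms, loop filters and intra predictors). Slots not assigned
// here are left untouched by this init routine.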
extern void VP8DspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i_NEON;
#endif
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;

  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON