GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/libwebp/src/dsp/dec_neon.c
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee ([email protected])
//          Johann Koenig ([email protected])

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include "src/dsp/neon.h"
#include "src/dec/vp8i_dec.h"

//------------------------------------------------------------------------------
// NxM Loading functions

#if !defined(WORK_AROUND_GCC)

// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
                                            int stride) {
  const uint8x8_t zero = vdup_n_u8(0);
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
  out = vld4_lane_u8(src + 0 * stride, out, 0);
  out = vld4_lane_u8(src + 1 * stride, out, 1);
  out = vld4_lane_u8(src + 2 * stride, out, 2);
  out = vld4_lane_u8(src + 3 * stride, out, 3);
  out = vld4_lane_u8(src + 4 * stride, out, 4);
  out = vld4_lane_u8(src + 5 * stride, out, 5);
  out = vld4_lane_u8(src + 6 * stride, out, 6);
  out = vld4_lane_u8(src + 7 * stride, out, 7);
  return out;
}

static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
}

#else  // WORK_AROUND_GCC

#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  src += stride;                                                     \
} while (0)

static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  {
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Load8x16_NEON(
    const uint8_t* const src, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  *p1 = vld1q_u8(src - 2 * stride);
  *p0 = vld1q_u8(src - 1 * stride);
  *q0 = vld1q_u8(src + 0 * stride);
  *q1 = vld1q_u8(src + 1 * stride);
}

static WEBP_INLINE void Load16x8_NEON(
    const uint8_t* const src, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
  Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load8x8x2_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#if !defined(WORK_AROUND_GCC)

#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

static WEBP_INLINE void Load8x8x2T_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
                                      uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
  vst2_lane_u8(dst + 1 * stride, v, 1);
  vst2_lane_u8(dst + 2 * stride, v, 2);
  vst2_lane_u8(dst + 3 * stride, v, 3);
  vst2_lane_u8(dst + 4 * stride, v, 4);
  vst2_lane_u8(dst + 5 * stride, v, 5);
  vst2_lane_u8(dst + 6 * stride, v, 6);
  vst2_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  uint8x8x2_t lo, hi;
  lo.val[0] = vget_low_u8(p0);
  lo.val[1] = vget_low_u8(q0);
  hi.val[0] = vget_high_u8(p0);
  hi.val[1] = vget_high_u8(q0);
  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
}

#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
                                      uint8_t* const dst, int stride) {
  vst4_lane_u8(dst + 0 * stride, v, 0);
  vst4_lane_u8(dst + 1 * stride, v, 1);
  vst4_lane_u8(dst + 2 * stride, v, 2);
  vst4_lane_u8(dst + 3 * stride, v, 3);
  vst4_lane_u8(dst + 4 * stride, v, 4);
  vst4_lane_u8(dst + 5 * stride, v, 5);
  vst4_lane_u8(dst + 6 * stride, v, 6);
  vst4_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  uint8x8x4_t lo, hi;
  INIT_VECTOR4(lo,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
}
#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  vst1q_u8(dst - stride, p0);
  vst1q_u8(dst, q0);
}

static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  Store16x2_NEON(p1, p0, dst - stride, stride);
  Store16x2_NEON(q0, q1, dst + stride, stride);
}

static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u,          vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v,          vget_high_u8(q0));
}

static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
}

#if !defined(WORK_AROUND_GCC)

#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
  (DST) += stride;                                \
} while (0)

static WEBP_INLINE void Store6x8x2_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    uint8_t* u, uint8_t* v, int stride) {
  uint8x8x3_t u0, u1, v0, v1;
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE

static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  uint8x8x4_t u0, v0;
  INIT_VECTOR4(u0,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(v0,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

#endif  // !WORK_AROUND_GCC

// Zero extend 'v' to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}
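
// Editor's note (added; not in upstream libwebp): vrsraq_n_s16(acc, row, 3)
// above computes acc + ((row + 4) >> 3) per lane, i.e. the rounded descale
// by 8 of the residual fused with the accumulation onto the source pixels.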

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
  return mask;
}
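
// Editor's sketch (added; not part of libwebp, kept compiled-out): a scalar
// rendering of the per-column predicate evaluated above. The vector version
// additionally saturates the two sums at 255 via vqaddq_u8.
#if 0
static int NeedsFilter_OneColumn(int p1, int p0, int q0, int q1, int thresh) {
  const int a_p0_q0 = (p0 > q0) ? p0 - q0 : q0 - p0;  // abs(p0 - q0)
  const int a_p1_q1 = (p1 > q1) ? p1 - q1 : q1 - p1;  // abs(p1 - q1)
  return 2 * a_p0_q0 + a_p1_q1 / 2 <= thresh;         // all-ones lane if true
}
#endif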

static int8x16_t FlipSign_NEON(const uint8x16_t v) {
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}

static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
  const int8x16_t sign_bit = vdupq_n_s8(0x80);
  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}

static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
                                   const int8x16_t q0, const int8x16_t q1) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);     // (q0-p0)
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);     // (p1-q1)
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);  // (p1-q1) + 1 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);     // (p1-q1) + 2 * (q0 - p0)
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);     // (p1-q1) + 3 * (q0 - p0)
  return s3;
}
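
// Editor's note (added; hedged): the chain of vqaddq_s8 calls accumulates
// 3 * (q0 - p0) + (p1 - q1) with signed saturation after every step, keeping
// each partial sum in [-128, 127] like the signed-char clamping of the scalar
// loop filter; a plain wrapping add could overflow for strong edges.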

static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);     // (q0-p0)
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);  // 2 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);     // 3 * (q0 - p0)
  return s2;
}

//------------------------------------------------------------------------------

static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
                                    const int8x16_t delta,
                                    int8x16_t* const op0,
                                    int8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  *op0 = vqaddq_s8(p0s, delta3);
  *oq0 = vqsubq_s8(q0s, delta4);
}
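
// Worked example (editor's addition): for delta = 4, delta3 = (4 + 3) >> 3 = 0
// and delta4 = (4 + 4) >> 3 = 1, so q0 absorbs the odd half-step, while for
// delta = 5 both shifts give 1. This matches the scalar filter's
// (a + 4) >> 3 / (a + 3) >> 3 pair.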

#if defined(WEBP_USE_INTRINSICS)

static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
  *op0 = FlipSignBack_NEON(sp0);
  *oq0 = FlipSignBack_NEON(sq0);
}

static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t p1s = FlipSign_NEON(p1);
  const int8x16_t p0s = FlipSign_NEON(p0);
  const int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
}

static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, op0, oq0;
  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store16x2_NEON(op0, oq0, p, stride);
}

static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, oq0, op0;
  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store2x16_NEON(op0, oq0, p, stride);
}

#else

// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

#define STORE8x2(c1, c2, p, stride)                       \
  "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

#define QRegs "q0", "q1", "q2", "q3",                     \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s)                           \
  "veor     " #a "," #a "," #s "             \n"          \
  "veor     " #b "," #b "," #s "             \n"          \

#define FLIP_SIGN_BIT4(a, b, c, d, s)                     \
  FLIP_SIGN_BIT2(a, b, s)                                 \
  FLIP_SIGN_BIT2(c, d, s)                                 \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "            \n"  /* abs(p0 - q0) */           \
  "vabd.u8    q14," #p1 "," #q1 "            \n"  /* abs(p1 - q1) */           \
  "vqadd.u8   q15, q15, q15                  \n"  /* abs(p0 - q0) * 2 */       \
  "vshr.u8    q14, q14, #1                   \n"  /* abs(p1 - q1) / 2 */       \
  "vqadd.u8   q15, q15, q14  \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */    \
  "vdup.8     q14, " #thresh "               \n"                               \
  "vcge.u8   " #mask ", q14, q15             \n"  /* mask <= thresh */

#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8 " #o "," #p1 "," #q1 "        \n"  /* (p1 - q1) */                 \
  "vqadd.s8 " #o "," #o ", q15            \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15            \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15            \n"  /* (p1 - q1) + 3 * (q0 - p0) */

#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                     \n"                               \
  "vqadd.s8   q15, q15, " #fl "              \n"  /* filter1 = filter + 3 */   \
  "vshr.s8    q15, q15, #3                   \n"  /* filter1 >> 3 */           \
  "vqadd.s8 " #p0 "," #p0 ", q15             \n"  /* p0 += filter1 */          \
                                                                               \
  "vmov.i8    q15, #0x04                     \n"                               \
  "vqadd.s8   q15, q15, " #fl "              \n"  /* filter2 = filter + 4 */   \
  "vshr.s8    q15, q15, #3                   \n"  /* filter2 >> 3 */           \
  "vqsub.s8 " #q0 "," #q0 ", q15             \n"  /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)    /* filter mask in q9 */          \
  "vmov.i8    q10, #0x80                     \n"  /* sign bit */               \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)         /* convert to signed value */    \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)         /* get filter level */           \
  "vand       q9, q9, q11                    \n"  /* apply filter mask */      \
  DO_SIMPLE_FILTER(p0, q0, q9)                /* apply filter */               \
  FLIP_SIGN_BIT2(p0, q0, q10)

static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

    "vswp       d5, d24                        \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

#undef LOAD8x4
#undef STORE8x2

#endif  // WEBP_USE_INTRINSICS

static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    SimpleVFilter16_NEON(p, stride, thresh);
  }
}

static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4;
    SimpleHFilter16_NEON(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                const uint8x16_t q0, const uint8x16_t q1,
                                int hev_thresh) {
  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
  return mask;
}

static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
                                    const uint8x16_t p1, const uint8x16_t p0,
                                    const uint8x16_t q0, const uint8x16_t q1,
                                    const uint8x16_t q2, const uint8x16_t q3,
                                    int ithresh, int thresh) {
  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
  const uint8x16_t max12 = vmaxq_u8(max1, max2);
  const uint8x16_t max123 = vmaxq_u8(max12, max3);
  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
  const uint8x16_t mask = vandq_u8(mask1, mask2);
  return mask;
}

// 4-points filter

static void ApplyFilter4_NEON(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);      // a3 = (a1 + 1) >> 1
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
}

static void DoFilter4_NEON(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    const int8x16_t simple_lf_delta =
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}
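
// Editor's note (added): the veorq_u8 step relies on the bitwise identity
// (mask & hev) ^ mask == mask & ~hev, so complex_lf_mask selects exactly the
// lanes that pass the filter threshold but are not high-edge-variance.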

// 6-points filter

static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}
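
// Verification (editor's addition): with S = 9 * a - 1,
//   (S + 64) >> 7 == (9 * a + 63) >> 7, and
//   (S + 32) >> 6 == (18 * a + 62) >> 7 == (18 * a + 63) >> 7
// (the last equality holds because 18 * a is even, so 18 * a + 63 can never
// be a multiple of 128); likewise (18 * a + S + 64) >> 7 == (27 * a + 63) >> 7.
// The narrowing saturation of vqrshrn_n_s16 is the only added behavior.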

static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign_NEON(p2);
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t q2s = FlipSign_NEON(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                      op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16_NEON(op2, op1, p - 2, stride);
    Store2x16_NEON(op0, oq0, p + 0, stride);
    Store2x16_NEON(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}

static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This can not be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2.

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
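
// Worked example (editor's addition): vqdmulhq_n_s16(x, k) yields
// (2 * x * k) >> 16, and 2 * 17734 == 35468, so the kC2 product above is
// exact with no fixup. For kC1 the same instruction gives (x * 20091) >> 15;
// the extra >> 1 (via vsraq_n_s16) plus the add-back of x, standing in for
// the 1 << 16 term, restore the intended (x * (20091 + (1 << 16))) >> 16.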
1005
1006
#if defined(WEBP_USE_INTRINSICS)
1007
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1008
const int16x8_t in1,
1009
int16x8x2_t* const out) {
1010
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
1011
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
1012
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
1013
// b0 d0 b1 d1 b2 d2 ...
1014
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1015
}
1016
1017
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1018
// {rows} = in0 | in4
1019
// in8 | in12
1020
// B1 = in4 | in12
1021
const int16x8_t B1 =
1022
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1023
// C0 = kC1 * in4 | kC1 * in12
1024
// C1 = kC2 * in4 | kC2 * in12
1025
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1026
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1027
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1028
vget_low_s16(rows->val[1])); // in0 + in8
1029
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1030
vget_low_s16(rows->val[1])); // in0 - in8
1031
// c = kC2 * in4 - kC1 * in12
1032
// d = kC1 * in4 + kC2 * in12
1033
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1034
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1035
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
1036
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
1037
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
1038
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
1039
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1040
Transpose8x2_NEON(E0, E1, rows);
1041
}
1042
1043
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1044
int16x8x2_t rows;
1045
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1046
TransformPass_NEON(&rows);
1047
TransformPass_NEON(&rows);
1048
Add4x4_NEON(rows.val[0], rows.val[1], dst);
1049
}
1050
1051
#else
1052
1053
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1054
const int kBPS = BPS;
1055
// kC1, kC2. Padded because vld1.16 loads 8 bytes
1056
const int16_t constants[4] = { kC1, kC2, 0, 0 };
1057
/* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1058
__asm__ volatile (
1059
"vld1.16 {q1, q2}, [%[in]] \n"
1060
"vld1.16 {d0}, [%[constants]] \n"
1061
1062
/* d2: in[0]
1063
* d3: in[8]
1064
* d4: in[4]
1065
* d5: in[12]
1066
*/
1067
"vswp d3, d4 \n"
1068
1069
/* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1070
* q9 = {in[4], in[12]} * kC2 >> 16
1071
*/
1072
"vqdmulh.s16 q8, q2, d0[0] \n"
1073
"vqdmulh.s16 q9, q2, d0[1] \n"
1074
1075
/* d22 = a = in[0] + in[8]
1076
* d23 = b = in[0] - in[8]
1077
*/
1078
"vqadd.s16 d22, d2, d3 \n"
1079
"vqsub.s16 d23, d2, d3 \n"
1080
1081
/* The multiplication should be x * kC1 >> 16
1082
* However, with vqdmulh we get x * kC1 * 2 >> 16
1083
* (multiply, double, return high half)
1084
* We avoided this in kC2 by pre-shifting the constant.
1085
* q8 = in[4]/[12] * kC1 >> 16
1086
*/
1087
"vshr.s16 q8, q8, #1 \n"
1088
1089
/* Add {in[4], in[12]} back after the multiplication. This is handled by
1090
* adding 1 << 16 to kC1 in the libwebp C code.
1091
*/
1092
"vqadd.s16 q8, q2, q8 \n"
1093
1094
/* d20 = c = in[4]*kC2 - in[12]*kC1
1095
* d21 = d = in[4]*kC1 + in[12]*kC2
1096
*/
1097
"vqsub.s16 d20, d18, d17 \n"
1098
"vqadd.s16 d21, d19, d16 \n"
1099
1100
/* d2 = tmp[0] = a + d
1101
* d3 = tmp[1] = b + c
1102
* d4 = tmp[2] = b - c
1103
* d5 = tmp[3] = a - d
1104
*/
1105
"vqadd.s16 d2, d22, d21 \n"
1106
"vqadd.s16 d3, d23, d20 \n"
1107
"vqsub.s16 d4, d23, d20 \n"
1108
"vqsub.s16 d5, d22, d21 \n"
1109
1110
"vzip.16 q1, q2 \n"
1111
"vzip.16 q1, q2 \n"
1112
1113
"vswp d3, d4 \n"
1114
1115
/* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1116
* q9 = {tmp[4], tmp[12]} * kC2 >> 16
1117
*/
1118
"vqdmulh.s16 q8, q2, d0[0] \n"
1119
"vqdmulh.s16 q9, q2, d0[1] \n"
1120
1121
/* d22 = a = tmp[0] + tmp[8]
1122
* d23 = b = tmp[0] - tmp[8]
1123
*/
1124
"vqadd.s16 d22, d2, d3 \n"
1125
"vqsub.s16 d23, d2, d3 \n"
1126
1127
/* See long winded explanations prior */
1128
"vshr.s16 q8, q8, #1 \n"
1129
"vqadd.s16 q8, q2, q8 \n"
1130
1131
/* d20 = c = in[4]*kC2 - in[12]*kC1
1132
* d21 = d = in[4]*kC1 + in[12]*kC2
1133
*/
1134
"vqsub.s16 d20, d18, d17 \n"
1135
"vqadd.s16 d21, d19, d16 \n"
1136
1137
/* d2 = tmp[0] = a + d
1138
* d3 = tmp[1] = b + c
1139
* d4 = tmp[2] = b - c
1140
* d5 = tmp[3] = a - d
1141
*/
1142
"vqadd.s16 d2, d22, d21 \n"
1143
"vqadd.s16 d3, d23, d20 \n"
1144
"vqsub.s16 d4, d23, d20 \n"
1145
"vqsub.s16 d5, d22, d21 \n"
1146
1147
"vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1148
"vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1149
"vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1150
"vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1151
1152
"sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1153
1154
/* (val) + 4 >> 3 */
1155
"vrshr.s16 d2, d2, #3 \n"
1156
"vrshr.s16 d3, d3, #3 \n"
1157
"vrshr.s16 d4, d4, #3 \n"
1158
"vrshr.s16 d5, d5, #3 \n"
1159
1160
"vzip.16 q1, q2 \n"
1161
"vzip.16 q1, q2 \n"
1162
1163
/* Must accumulate before saturating */
1164
"vmovl.u8 q8, d6 \n"
1165
"vmovl.u8 q9, d7 \n"
1166
1167
"vqadd.s16 q1, q1, q8 \n"
1168
"vqadd.s16 q2, q2, q9 \n"
1169
1170
"vqmovun.s16 d0, q1 \n"
1171
"vqmovun.s16 d1, q2 \n"
1172
1173
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1174
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1175
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1176
"vst1.32 d1[1], [%[dst]] \n"
1177
1178
: [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1179
: [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1180
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1181
);
1182
}
1183
1184
#endif // WEBP_USE_INTRINSICS
1185
1186
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
1187
TransformOne_NEON(in, dst);
1188
if (do_two) {
1189
TransformOne_NEON(in + 16, dst + 4);
1190
}
1191
}
1192
1193
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1194
const int16x8_t DC = vdupq_n_s16(in[0]);
1195
Add4x4_NEON(DC, DC, dst);
1196
}
1197
1198
//------------------------------------------------------------------------------
1199
1200
#define STORE_WHT(dst, col, rows) do { \
1201
*dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1202
*dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1203
*dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1204
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1205
} while (0)
1206
1207
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1208
int32x4x4_t tmp;
1209
1210
{
1211
// Load the source.
1212
const int16x4_t in00_03 = vld1_s16(in + 0);
1213
const int16x4_t in04_07 = vld1_s16(in + 4);
1214
const int16x4_t in08_11 = vld1_s16(in + 8);
1215
const int16x4_t in12_15 = vld1_s16(in + 12);
1216
const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1217
const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1218
const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1219
const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1220
tmp.val[0] = vaddq_s32(a0, a1);
1221
tmp.val[1] = vaddq_s32(a3, a2);
1222
tmp.val[2] = vsubq_s32(a0, a1);
1223
tmp.val[3] = vsubq_s32(a3, a2);
1224
// Arrange the temporary results column-wise.
1225
tmp = Transpose4x4_NEON(tmp);
1226
}
1227
1228
{
1229
const int32x4_t kCst3 = vdupq_n_s32(3);
1230
const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
1231
const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1232
const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1233
const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1234
const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1235
1236
tmp.val[0] = vaddq_s32(a0, a1);
1237
tmp.val[1] = vaddq_s32(a3, a2);
1238
tmp.val[2] = vsubq_s32(a0, a1);
1239
tmp.val[3] = vsubq_s32(a3, a2);
1240
1241
// right shift the results by 3.
1242
tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1243
tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1244
tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1245
tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1246
1247
STORE_WHT(out, 0, tmp);
1248
STORE_WHT(out, 1, tmp);
1249
STORE_WHT(out, 2, tmp);
1250
STORE_WHT(out, 3, tmp);
1251
}
1252
}
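
// Editor's note (added): adding 3 to the dc column once is enough to round
// every output: each of the four final sums contains exactly one of a0/a3
// (the two terms built from 'dc'), so each plain >> 3 above behaves as
// (x + 3) >> 3 of the unrounded sum, as in the scalar TransformWHT.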

#undef STORE_WHT

//------------------------------------------------------------------------------

#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4_NEON(m0_m1, m2_m3, dst);
}
#undef MUL
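
// Worked example (editor's addition): since kC1_full == 20091 + (1 << 16) and
// the added term has zero low bits, MUL(x, kC1_full) ==
// x + ((x * 20091) >> 16), the libvpx "ip[] + ((ip[] * kC1) >> 16)" form
// folded into a single multiply.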

//------------------------------------------------------------------------------
// 4x4

static void DC4_NEON(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
  const uint16x8_t s0 = vaddq_u16(L0, L1);
  const uint16x8_t s1 = vaddq_u16(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}

// TrueMotion (4x4 + 8x8)
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);           // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }

static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}
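
// Editor's note (added): the vhadd_u8/vrhadd_u8 pair computes
// (((A + C) >> 1) + B + 1) >> 1, which equals the 3-tap average
// (A + 2 * B + C + 2) >> 2 for all byte inputs: the +1 of the rounding
// half-add exactly compensates the bit dropped by the truncating half-add.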
1355
1356
static void RD4_NEON(uint8_t* dst) { // Down-right
1357
const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1358
const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1359
const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1360
const uint32_t I = dst[-1 + 0 * BPS];
1361
const uint32_t J = dst[-1 + 1 * BPS];
1362
const uint32_t K = dst[-1 + 2 * BPS];
1363
const uint32_t L = dst[-1 + 3 * BPS];
1364
const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
1365
const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1366
const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1367
const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1368
const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1369
const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1370
const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1371
const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1372
const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1373
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1374
const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1375
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1376
const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1377
const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1378
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1379
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1380
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1381
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1382
}

static void LD4_NEON(uint8_t* dst) {    // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
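// Note (hedged): the vset_lane_u8() in LD4 handles the block edge: only the
// eight top pixels A..H are available, so the last one, H (dst[-BPS + 7]),
// is replicated into the twice-shifted vector (giving C D E F G H H 0) in
// place of the missing ninth neighbor, matching the VP8 down-left edge rule.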

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  int j;
  for (j = 0; j < 8; ++j) {
    vst1_u8(dst + j * BPS, top);
  }
}

static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const uint8x8_t left = vld1_dup_u8(dst - 1);
    vst1_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
    const uint16x8_t s0 = vaddq_u16(L0, L1);
    const uint16x8_t s1 = vaddq_u16(L2, L3);
    const uint16x8_t s2 = vaddq_u16(L4, L5);
    const uint16x8_t s3 = vaddq_u16(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}
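// Note (hedged): after the reductions above only lane 0 of sum_top/sum_left
// is meaningful, and vrshrn_n_u16() is a *rounding* narrowing shift, so the
// three cases compute ((top + left sums) + 8) >> 4, (top sum + 4) >> 3 and
// (left sum + 4) >> 3 respectively. A scalar sketch of the both-borders
// case, assuming the same dst/BPS layout:
//
//   uint32_t sum = 0;
//   int k;
//   for (k = 0; k < 8; ++k) sum += dst[k - BPS] + dst[k * BPS - 1];
//   for (k = 0; k < 8; ++k) {
//     memset(dst + k * BPS, (uint8_t)((sum + 8) >> 4), 8);
//   }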

static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }

static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

static void VE16_NEON(uint8_t* dst) {    // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
      const uint16x8_t s0 = vaddq_u16(L0, L1);
      const uint16x8_t s1 = vaddq_u16(L2, L3);
      const uint16x8_t s2 = vaddq_u16(L4, L5);
      const uint16x8_t s3 = vaddq_u16(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}
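// Note (hedged): DC16_NEON is the same reduction as DC8_NEON widened to
// 16-pixel borders, so the rounding narrowing shifts become
// (sum + 16) >> 5 when both borders are present and (sum + 8) >> 4 when
// only one is.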

static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }

static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i_NEON;
#endif
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;
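  // Note (hedged): the remaining VP8PredLuma4 slots (HE4, VR4, VL4, HD4,
  // HU4) have no NEON implementation in this file and keep whatever
  // versions were installed before VP8DspInitNEON() was called.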

  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON