GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/dsp/enc_neon.c
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

26
// Inverse transform.
27
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.
29
30
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
31
static const int16_t kC2 =
32
WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.
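// Note: kC2 is stored pre-halved to compensate for vqdmulh, which returns the
// high half of the *doubled* product; vqdmulhq_n_s16(x, kC2) therefore
// computes (x * WEBP_TRANSFORM_AC3_C2) >> 16.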
33
34
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inline assembly.
38
#if defined(WEBP_USE_INTRINSICS)
39
40
// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
41
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
42
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
43
}
44
45
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
46
// to the corresponding rows of 'dst'.
47
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
48
const int16x8_t dst01,
49
const int16x8_t dst23) {
50
// Unsigned saturate to 8b.
51
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
52
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
53
54
// Store the results.
55
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
56
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
57
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
58
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
59
}
60
61
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
62
const int16x8_t row23,
63
const uint8_t* WEBP_RESTRICT const ref,
64
uint8_t* WEBP_RESTRICT const dst) {
65
uint32x2_t dst01 = vdup_n_u32(0);
66
uint32x2_t dst23 = vdup_n_u32(0);
67
68
// Load the source pixels.
69
dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
70
dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
71
dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
72
dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
73
74
{
75
// Convert to 16b.
76
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
77
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
78
79
// Descale with rounding.
80
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
81
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
82
// Add the inverse transform.
83
SaturateAndStore4x4_NEON(dst, out01, out23);
84
}
85
}
86
87
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
88
const int16x8_t in1,
89
int16x8x2_t* const out) {
90
// a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3   =>   a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1);  // a0 c0 a1 c1 a2 c2 ...
                                               // b0 d0 b1 d1 b2 d2 ...
94
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
95
}
96
97
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
98
// {rows} = in0 | in4
99
// in8 | in12
100
// B1 = in4 | in12
101
const int16x8_t B1 =
102
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
103
// C0 = kC1 * in4 | kC1 * in12
104
// C1 = kC2 * in4 | kC2 * in12
105
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
106
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
107
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
108
vget_low_s16(rows->val[1])); // in0 + in8
109
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
110
vget_low_s16(rows->val[1])); // in0 - in8
111
// c = kC2 * in4 - kC1 * in12
112
// d = kC1 * in4 + kC2 * in12
113
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
114
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
115
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
116
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
117
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
118
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
119
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
120
Transpose8x2_NEON(E0, E1, rows);
121
}
122
123
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
124
const int16_t* WEBP_RESTRICT in,
125
uint8_t* WEBP_RESTRICT dst) {
126
int16x8x2_t rows;
127
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
128
TransformPass_NEON(&rows);
129
TransformPass_NEON(&rows);
130
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
131
}
132
133
#else
134
135
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
136
const int16_t* WEBP_RESTRICT in,
137
uint8_t* WEBP_RESTRICT dst) {
138
const int kBPS = BPS;
139
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
140
141
__asm__ volatile (
142
"vld1.16 {q1, q2}, [%[in]] \n"
143
"vld1.16 {d0}, [%[kC1C2]] \n"
144
145
// d2: in[0]
146
// d3: in[8]
147
// d4: in[4]
148
// d5: in[12]
149
"vswp d3, d4 \n"
150
151
// q8 = {in[4], in[12]} * kC1 * 2 >> 16
152
// q9 = {in[4], in[12]} * kC2 >> 16
153
"vqdmulh.s16 q8, q2, d0[0] \n"
154
"vqdmulh.s16 q9, q2, d0[1] \n"
155
156
// d22 = a = in[0] + in[8]
157
// d23 = b = in[0] - in[8]
158
"vqadd.s16 d22, d2, d3 \n"
159
"vqsub.s16 d23, d2, d3 \n"
160
161
// q8 = in[4]/[12] * kC1 >> 16
162
"vshr.s16 q8, q8, #1 \n"
163
164
// Add {in[4], in[12]} back after the multiplication.
165
"vqadd.s16 q8, q2, q8 \n"
166
167
// d20 = c = in[4]*kC2 - in[12]*kC1
168
// d21 = d = in[4]*kC1 + in[12]*kC2
169
"vqsub.s16 d20, d18, d17 \n"
170
"vqadd.s16 d21, d19, d16 \n"
171
172
// d2 = tmp[0] = a + d
173
// d3 = tmp[1] = b + c
174
// d4 = tmp[2] = b - c
175
// d5 = tmp[3] = a - d
176
"vqadd.s16 d2, d22, d21 \n"
177
"vqadd.s16 d3, d23, d20 \n"
178
"vqsub.s16 d4, d23, d20 \n"
179
"vqsub.s16 d5, d22, d21 \n"
180
181
"vzip.16 q1, q2 \n"
182
"vzip.16 q1, q2 \n"
183
184
"vswp d3, d4 \n"
185
186
// q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
187
// q9 = {tmp[4], tmp[12]} * kC2 >> 16
188
"vqdmulh.s16 q8, q2, d0[0] \n"
189
"vqdmulh.s16 q9, q2, d0[1] \n"
190
191
// d22 = a = tmp[0] + tmp[8]
192
// d23 = b = tmp[0] - tmp[8]
193
"vqadd.s16 d22, d2, d3 \n"
194
"vqsub.s16 d23, d2, d3 \n"
195
196
"vshr.s16 q8, q8, #1 \n"
197
"vqadd.s16 q8, q2, q8 \n"
198
199
// d20 = c = in[4]*kC2 - in[12]*kC1
200
// d21 = d = in[4]*kC1 + in[12]*kC2
201
"vqsub.s16 d20, d18, d17 \n"
202
"vqadd.s16 d21, d19, d16 \n"
203
204
// d2 = tmp[0] = a + d
205
// d3 = tmp[1] = b + c
206
// d4 = tmp[2] = b - c
207
// d5 = tmp[3] = a - d
208
"vqadd.s16 d2, d22, d21 \n"
209
"vqadd.s16 d3, d23, d20 \n"
210
"vqsub.s16 d4, d23, d20 \n"
211
"vqsub.s16 d5, d22, d21 \n"
212
213
"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
214
"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
215
"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
216
"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
217
218
"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
219
220
// ((val) + 4) >> 3
221
"vrshr.s16 d2, d2, #3 \n"
222
"vrshr.s16 d3, d3, #3 \n"
223
"vrshr.s16 d4, d4, #3 \n"
224
"vrshr.s16 d5, d5, #3 \n"
225
226
"vzip.16 q1, q2 \n"
227
"vzip.16 q1, q2 \n"
228
229
// Must accumulate before saturating
230
"vmovl.u8 q8, d6 \n"
231
"vmovl.u8 q9, d7 \n"
232
233
"vqadd.s16 q1, q1, q8 \n"
234
"vqadd.s16 q2, q2, q9 \n"
235
236
"vqmovun.s16 d0, q1 \n"
237
"vqmovun.s16 d1, q2 \n"
238
239
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
240
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
241
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
242
"vst1.32 d1[1], [%[dst]] \n"
243
244
: [in] "+r"(in), [dst] "+r"(dst) // modified registers
245
: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
246
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
247
);
248
}
249
250
#endif // WEBP_USE_INTRINSICS
251
252
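// Inverse-transforms one 4x4 block; when 'do_two' is set, a second block is
// also processed (coefficients at in + 16, reconstruction 4 pixels to the
// right in both 'ref' and 'dst').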
static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
253
const int16_t* WEBP_RESTRICT in,
254
uint8_t* WEBP_RESTRICT dst, int do_two) {
255
ITransformOne_NEON(ref, in, dst);
256
if (do_two) {
257
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
258
}
259
}
260
261
// Load all 4x4 pixels into a single uint8x16_t variable.
262
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
263
uint32x4_t out = vdupq_n_u32(0);
264
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
265
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
266
out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
267
out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
268
return vreinterpretq_u8_u32(out);
269
}
270
271
// Forward transform.
272
273
#if defined(WEBP_USE_INTRINSICS)
274
275
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
276
const int16x4_t B,
277
const int16x4_t C,
278
const int16x4_t D,
279
int16x8_t* const out01,
280
int16x8_t* const out32) {
281
const int16x4x2_t AB = vtrn_s16(A, B);
282
const int16x4x2_t CD = vtrn_s16(C, D);
283
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
284
vreinterpret_s32_s16(CD.val[0]));
285
const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
286
vreinterpret_s32_s16(CD.val[1]));
287
*out01 = vreinterpretq_s16_s64(
288
vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
289
vreinterpret_s64_s32(tmp13.val[0])));
290
*out32 = vreinterpretq_s16_s64(
291
vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
292
vreinterpret_s64_s32(tmp02.val[1])));
293
}
294
295
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
296
const uint8x8_t b) {
297
return vreinterpretq_s16_u16(vsubl_u8(a, b));
298
}
299
300
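// Forward transform of the 4x4 difference block (src - ref); the 16 resulting
// coefficients are stored to 'out' in row-major order.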
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
301
const uint8_t* WEBP_RESTRICT ref,
302
int16_t* WEBP_RESTRICT out) {
303
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
304
{
305
const uint8x16_t S0 = Load4x4_NEON(src);
306
const uint8x16_t R0 = Load4x4_NEON(ref);
307
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
308
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
309
const int16x4_t D0 = vget_low_s16(D0D1);
310
const int16x4_t D1 = vget_high_s16(D0D1);
311
const int16x4_t D2 = vget_low_s16(D2D3);
312
const int16x4_t D3 = vget_high_s16(D2D3);
313
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
314
}
315
{ // 1st pass
316
const int32x4_t kCst937 = vdupq_n_s32(937);
317
const int32x4_t kCst1812 = vdupq_n_s32(1812);
318
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
319
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
320
const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
321
const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
322
vget_high_s16(a0a1_2));
323
const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
324
vget_high_s16(a0a1_2));
325
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
326
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
327
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
328
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
329
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
330
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
331
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
332
}
333
{ // 2nd pass
334
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
335
const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
336
const int32x4_t kCst51000 = vdupq_n_s32(51000);
337
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
338
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
339
const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
340
const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
341
const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
342
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
343
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
344
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
345
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
346
const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
347
const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
348
const int16x4_t a3_eq_0 =
349
vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
350
const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
351
vst1_s16(out + 0, out0);
352
vst1_s16(out + 4, out1);
353
vst1_s16(out + 8, out2);
354
vst1_s16(out + 12, out3);
355
}
356
}
357
358
#else
359
360
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
361
static const int16_t kCoeff16[] = {
362
5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
363
};
364
static const int32_t kCoeff32[] = {
365
1812, 1812, 1812, 1812,
366
937, 937, 937, 937,
367
12000, 12000, 12000, 12000,
368
51000, 51000, 51000, 51000
369
};
370
371
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
372
const uint8_t* WEBP_RESTRICT ref,
373
int16_t* WEBP_RESTRICT out) {
374
const int kBPS = BPS;
375
const uint8_t* src_ptr = src;
376
const uint8_t* ref_ptr = ref;
377
const int16_t* coeff16 = kCoeff16;
378
const int32_t* coeff32 = kCoeff32;
379
380
__asm__ volatile (
381
// load src into q4, q5 in high half
382
"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
383
"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
384
"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
385
"vld1.8 {d11}, [%[src_ptr]] \n"
386
387
// load ref into q6, q7 in high half
388
"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
389
"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
390
"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
391
"vld1.8 {d15}, [%[ref_ptr]] \n"
392
393
// Pack the high values into q4 and q6
394
"vtrn.32 q4, q5 \n"
395
"vtrn.32 q6, q7 \n"
396
397
// d[0-3] = src - ref
398
"vsubl.u8 q0, d8, d12 \n"
399
"vsubl.u8 q1, d9, d13 \n"
400
401
// load coeff16 into q8(d16=5352, d17=2217)
402
"vld1.16 {q8}, [%[coeff16]] \n"
403
404
// load coeff32 high half into q9 = 1812, q10 = 937
405
"vld1.32 {q9, q10}, [%[coeff32]]! \n"
406
407
// load coeff32 low half into q11=12000, q12=51000
408
"vld1.32 {q11,q12}, [%[coeff32]] \n"
409
410
// part 1
411
// Transpose. Register dN is the same as dN in C
412
"vtrn.32 d0, d2 \n"
413
"vtrn.32 d1, d3 \n"
414
"vtrn.16 d0, d1 \n"
415
"vtrn.16 d2, d3 \n"
416
417
"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
418
"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
419
"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
420
"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
421
422
"vadd.s16 d0, d4, d5 \n" // a0 + a1
423
"vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
424
"vsub.s16 d2, d4, d5 \n" // a0 - a1
"vshl.s16 d2, d2, #3 \n" // temp[2+i*4] = (a0-a1) << 3
426
427
"vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
428
"vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
429
"vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
430
"vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
431
432
// temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
433
// temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
434
"vshrn.s32 d1, q9, #9 \n"
435
"vshrn.s32 d3, q10, #9 \n"
436
437
// part 2
438
// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
439
"vtrn.32 d0, d2 \n"
440
"vtrn.32 d1, d3 \n"
441
"vtrn.16 d0, d1 \n"
442
"vtrn.16 d2, d3 \n"
443
444
"vmov.s16 d26, #7 \n"
445
446
"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
447
"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
448
"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
449
"vadd.s16 d4, d4, d26 \n" // a1 + 7
450
"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
451
452
"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
453
"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
454
455
"vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
456
"vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
457
458
"vceq.s16 d4, d7, #0 \n"
459
460
"vshr.s16 d0, d0, #4 \n"
461
"vshr.s16 d2, d2, #4 \n"
462
463
"vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
464
"vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
465
466
"vmvn d4, d4 \n" // !(d1 == 0)
467
// op[4] = (c1*2217 + d1*5352 + 12000)>>16
468
"vshrn.s32 d1, q11, #16 \n"
469
// op[4] += (d1!=0)
470
"vsub.s16 d1, d1, d4 \n"
471
// op[12]= (d1*2217 - c1*5352 + 51000)>>16
472
"vshrn.s32 d3, q12, #16 \n"
473
474
// set result to out array
475
"vst1.16 {q0, q1}, [%[out]] \n"
476
: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
477
[coeff32] "+r"(coeff32) // modified registers
478
: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
479
[out] "r"(out) // constants
480
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
481
"q10", "q11", "q12", "q13" // clobbered
482
);
483
}
484
485
#endif
486
487
#define LOAD_LANE_16b(VALUE, LANE) do { \
488
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
489
src += stride; \
490
} while (0)
491
492
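// Forward Walsh-Hadamard transform of the sixteen DC coefficients, read from
// 'src' with a stride of 16; the final butterfly halves the results before
// they are stored to 'out'.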
static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
493
int16_t* WEBP_RESTRICT out) {
494
const int stride = 16;
495
const int16x4_t zero = vdup_n_s16(0);
496
int32x4x4_t tmp0;
497
int16x4x4_t in;
498
INIT_VECTOR4(in, zero, zero, zero, zero);
499
LOAD_LANE_16b(in.val[0], 0);
500
LOAD_LANE_16b(in.val[1], 0);
501
LOAD_LANE_16b(in.val[2], 0);
502
LOAD_LANE_16b(in.val[3], 0);
503
LOAD_LANE_16b(in.val[0], 1);
504
LOAD_LANE_16b(in.val[1], 1);
505
LOAD_LANE_16b(in.val[2], 1);
506
LOAD_LANE_16b(in.val[3], 1);
507
LOAD_LANE_16b(in.val[0], 2);
508
LOAD_LANE_16b(in.val[1], 2);
509
LOAD_LANE_16b(in.val[2], 2);
510
LOAD_LANE_16b(in.val[3], 2);
511
LOAD_LANE_16b(in.val[0], 3);
512
LOAD_LANE_16b(in.val[1], 3);
513
LOAD_LANE_16b(in.val[2], 3);
514
LOAD_LANE_16b(in.val[3], 3);
515
516
{
517
// a0 = in[0 * 16] + in[2 * 16]
518
// a1 = in[1 * 16] + in[3 * 16]
519
// a2 = in[1 * 16] - in[3 * 16]
520
// a3 = in[0 * 16] - in[2 * 16]
521
const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
522
const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
523
const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
524
const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
525
tmp0.val[0] = vaddq_s32(a0, a1);
526
tmp0.val[1] = vaddq_s32(a3, a2);
527
tmp0.val[2] = vsubq_s32(a3, a2);
528
tmp0.val[3] = vsubq_s32(a0, a1);
529
}
530
{
531
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
532
// a0 = tmp[0 + i] + tmp[ 8 + i]
533
// a1 = tmp[4 + i] + tmp[12 + i]
534
// a2 = tmp[4 + i] - tmp[12 + i]
535
// a3 = tmp[0 + i] - tmp[ 8 + i]
536
const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
537
const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
538
const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
539
const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
540
const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
541
const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
542
const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
543
const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
544
const int16x4_t out0 = vmovn_s32(b0);
545
const int16x4_t out1 = vmovn_s32(b1);
546
const int16x4_t out2 = vmovn_s32(b2);
547
const int16x4_t out3 = vmovn_s32(b3);
548
549
vst1_s16(out + 0, out0);
550
vst1_s16(out + 4, out1);
551
vst1_s16(out + 8, out2);
552
vst1_s16(out + 12, out3);
553
}
554
}
555
#undef LOAD_LANE_16b
556
557
//------------------------------------------------------------------------------
558
// Texture distortion
559
//
560
// We try to match the spectral content (weighted) between source and
561
// reconstructed samples.
562
563
// a 0123, b 0123
564
// a 4567, b 4567
565
// a 89ab, b 89ab
566
// a cdef, b cdef
567
//
568
// transpose
569
//
570
// a 048c, b 048c
571
// a 159d, b 159d
572
// a 26ae, b 26ae
573
// a 37bf, b 37bf
574
//
575
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
576
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
577
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
578
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
579
vreinterpretq_s32_s16(q2_tmp1.val[0]));
580
const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
581
vreinterpretq_s32_s16(q2_tmp1.val[1]));
582
q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
583
q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
584
q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
585
q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
586
return q4_in;
587
}
588
589
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
590
const int16x8x4_t q4_in) {
591
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
592
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
593
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
594
const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
595
const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
596
const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
597
int16x8x4_t q4_out;
598
// tmp[0] = a0 + a1
599
// tmp[1] = a3 + a2
600
// tmp[2] = a3 - a2
601
// tmp[3] = a0 - a1
602
INIT_VECTOR4(q4_out,
603
vabsq_s16(vaddq_s16(q_a0, q_a1)),
604
vabsq_s16(vaddq_s16(q_a3, q_a2)),
605
vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
606
return q4_out;
607
}
608
609
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
610
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
611
q4_in.val[2]));
612
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
613
q4_in.val[3]));
614
const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
615
q4_in.val[3]));
616
const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
617
q4_in.val[2]));
618
int16x8x4_t q4_out;
619
620
INIT_VECTOR4(q4_out,
621
vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
622
vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
623
return q4_out;
624
}
625
626
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
627
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
628
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
629
int16x4x4_t d4_w;
630
INIT_VECTOR4(d4_w,
631
vget_low_s16(vreinterpretq_s16_u16(q_w07)),
632
vget_high_s16(vreinterpretq_s16_u16(q_w07)),
633
vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
634
vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
635
return d4_w;
636
}
637
638
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
639
const int16x4x4_t d4_w) {
640
int32x2_t d_sum;
641
// sum += w[ 0] * abs(b0);
642
// sum += w[ 4] * abs(b1);
643
// sum += w[ 8] * abs(b2);
644
// sum += w[12] * abs(b3);
645
int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
646
int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
647
int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
648
int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
649
q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
650
q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
651
q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
652
q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
653
654
q_sum0 = vaddq_s32(q_sum0, q_sum1);
655
q_sum2 = vaddq_s32(q_sum2, q_sum3);
656
q_sum2 = vaddq_s32(q_sum0, q_sum2);
657
d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
658
d_sum = vpadd_s32(d_sum, d_sum);
659
return d_sum;
660
}
661
662
#define LOAD_LANE_32b(src, VALUE, LANE) \
663
(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
664
665
// Hadamard transform
666
// Returns the weighted sum of the absolute value of transformed coefficients.
667
// w[] contains a row-major 4 by 4 symmetric matrix.
668
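// Illustrative scalar sketch of what the NEON code below computes: with H()
// the 4x4 Hadamard transform,
//   sum(x) = sum_i w[i] * |H(x)[i]|
// and the returned value is |sum(a) - sum(b)| >> 5.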
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
669
const uint8_t* WEBP_RESTRICT const b,
670
const uint16_t* WEBP_RESTRICT const w) {
671
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
672
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
673
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
674
uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
675
uint8x8x4_t d4_in;
676
677
// load data a, b
678
LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
679
LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
680
LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
681
LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
682
LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
683
LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
684
LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
685
LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
686
INIT_VECTOR4(d4_in,
687
vreinterpret_u8_u32(d_in_ab_0123),
688
vreinterpret_u8_u32(d_in_ab_4567),
689
vreinterpret_u8_u32(d_in_ab_89ab),
690
vreinterpret_u8_u32(d_in_ab_cdef));
691
692
{
693
// Vertical pass first to avoid a transpose (vertical and horizontal passes
694
// are commutative because w/kWeightY is symmetric) and subsequent
695
// transpose.
696
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
697
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
698
// horizontal pass
699
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
700
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
701
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
702
703
// abs(sum2 - sum1) >> 5
704
d_sum = vabs_s32(d_sum);
705
d_sum = vshr_n_s32(d_sum, 5);
706
return vget_lane_s32(d_sum, 0);
707
}
708
}
709
#undef LOAD_LANE_32b
710
711
static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
712
const uint8_t* WEBP_RESTRICT const b,
713
const uint16_t* WEBP_RESTRICT const w) {
714
int D = 0;
715
int x, y;
716
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
717
for (x = 0; x < 16; x += 4) {
718
D += Disto4x4_NEON(a + x + y, b + x + y, w);
719
}
720
}
721
return D;
722
}
723
724
//------------------------------------------------------------------------------
725
726
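// Runs the forward transform on each block in [start_block, end_block) and
// bins the coefficients: each |coeff| is scaled down by 3 bits and clipped to
// MAX_COEFF_THRESH before updating the distribution.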
static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
727
const uint8_t* WEBP_RESTRICT pred,
728
int start_block, int end_block,
729
VP8Histogram* WEBP_RESTRICT const histo) {
730
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
731
int j;
732
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
733
for (j = start_block; j < end_block; ++j) {
734
int16_t out[16];
735
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
736
{
737
int k;
738
const int16x8_t a0 = vld1q_s16(out + 0);
739
const int16x8_t b0 = vld1q_s16(out + 8);
740
const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
741
const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
742
const uint16x8_t a2 = vshrq_n_u16(a1, 3);
743
const uint16x8_t b2 = vshrq_n_u16(b1, 3);
744
const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
745
const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
746
vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
747
vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
748
// Convert coefficients to bin.
749
for (k = 0; k < 16; ++k) {
750
++distribution[out[k]];
751
}
752
}
753
}
754
VP8SetHistogramData(distribution, histo);
755
}
756
757
//------------------------------------------------------------------------------
758
759
static WEBP_INLINE void AccumulateSSE16_NEON(
760
const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
761
uint32x4_t* const sum) {
762
const uint8x16_t a0 = vld1q_u8(a);
763
const uint8x16_t b0 = vld1q_u8(b);
764
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
765
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
766
vget_low_u8(abs_diff));
767
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
768
vget_high_u8(abs_diff));
769
/* pair-wise adds and widen */
770
const uint32x4_t sum1 = vpaddlq_u16(prod1);
771
const uint32x4_t sum2 = vpaddlq_u16(prod2);
772
*sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
773
}
774
775
// Horizontal sum of all four uint32_t values in 'sum'.
776
static int SumToInt_NEON(uint32x4_t sum) {
777
#if WEBP_AARCH64
778
return (int)vaddvq_u32(sum);
779
#else
780
const uint64x2_t sum2 = vpaddlq_u32(sum);
781
const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
782
vreinterpret_u32_u64(vget_high_u64(sum2)));
783
return (int)vget_lane_u32(sum3, 0);
784
#endif
785
}
786
787
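// Sum of squared differences between the 16x16 blocks at 'a' and 'b' (rows are
// BPS bytes apart); the 16x8, 8x8 and 4x4 variants below work the same way.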
static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
788
const uint8_t* WEBP_RESTRICT b) {
789
uint32x4_t sum = vdupq_n_u32(0);
790
int y;
791
for (y = 0; y < 16; ++y) {
792
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
793
}
794
return SumToInt_NEON(sum);
795
}
796
797
static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
798
const uint8_t* WEBP_RESTRICT b) {
799
uint32x4_t sum = vdupq_n_u32(0);
800
int y;
801
for (y = 0; y < 8; ++y) {
802
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
803
}
804
return SumToInt_NEON(sum);
805
}
806
807
static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
808
const uint8_t* WEBP_RESTRICT b) {
809
uint32x4_t sum = vdupq_n_u32(0);
810
int y;
811
for (y = 0; y < 8; ++y) {
812
const uint8x8_t a0 = vld1_u8(a + y * BPS);
813
const uint8x8_t b0 = vld1_u8(b + y * BPS);
814
const uint8x8_t abs_diff = vabd_u8(a0, b0);
815
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
816
sum = vpadalq_u16(sum, prod);
817
}
818
return SumToInt_NEON(sum);
819
}
820
821
static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
822
const uint8_t* WEBP_RESTRICT b) {
823
const uint8x16_t a0 = Load4x4_NEON(a);
824
const uint8x16_t b0 = Load4x4_NEON(b);
825
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
826
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
827
vget_low_u8(abs_diff));
828
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
829
vget_high_u8(abs_diff));
830
/* pair-wise adds and widen */
831
const uint32x4_t sum1 = vpaddlq_u16(prod1);
832
const uint32x4_t sum2 = vpaddlq_u16(prod2);
833
return SumToInt_NEON(vaddq_u32(sum1, sum2));
834
}
835
836
//------------------------------------------------------------------------------
837
838
// Compilation with gcc-4.6.x is problematic for now.
839
#if !defined(WORK_AROUND_GCC)
840
841
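// Quantizes 8 coefficients of 'in' starting at 'offset' with the matrix 'mtx':
// the dequantized values are written back to 'in' and the signed quantized
// levels are returned.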
static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
842
const VP8Matrix* WEBP_RESTRICT const mtx,
843
int offset) {
844
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
845
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
846
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
847
const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
848
const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
849
850
const int16x8_t a = vld1q_s16(in + offset); // in
851
const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
852
const int16x8_t sign = vshrq_n_s16(a, 15); // sign
853
const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
854
const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
855
const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
856
const uint32x4_t m2 = vhaddq_u32(m0, bias0);
857
const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
858
const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
859
vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
860
const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
861
const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
862
const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
863
const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
864
vst1q_s16(in + offset, c4);
865
assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
866
return c3;
867
}
868
869
static const uint8_t kShuffles[4][8] = {
870
{ 0, 1, 2, 3, 8, 9, 16, 17 },
871
{ 10, 11, 4, 5, 6, 7, 12, 13 },
872
{ 18, 19, 24, 25, 26, 27, 20, 21 },
873
{ 14, 15, 22, 23, 28, 29, 30, 31 }
874
};
875
876
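// Quantizes a full 4x4 block: levels are stored in zigzag order in 'out' and
// the return value is 1 if any of them is non-zero.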
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
877
const VP8Matrix* WEBP_RESTRICT const mtx) {
878
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
879
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
880
uint8x8x4_t shuffles;
881
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
882
// non-standard versions there.
883
#if defined(__APPLE__) && WEBP_AARCH64 && \
884
defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
885
uint8x16x2_t all_out;
886
INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
887
INIT_VECTOR4(shuffles,
888
vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
889
vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
890
vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
891
vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
892
#else
893
uint8x8x4_t all_out;
894
INIT_VECTOR4(all_out,
895
vreinterpret_u8_s16(vget_low_s16(out0)),
896
vreinterpret_u8_s16(vget_high_s16(out0)),
897
vreinterpret_u8_s16(vget_low_s16(out1)),
898
vreinterpret_u8_s16(vget_high_s16(out1)));
899
INIT_VECTOR4(shuffles,
900
vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
901
vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
902
vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
903
vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
904
#endif
905
// Zigzag reordering
906
vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
907
vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
908
vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
909
vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
910
// test zeros
911
if (*(uint64_t*)(out + 0) != 0) return 1;
912
if (*(uint64_t*)(out + 4) != 0) return 1;
913
if (*(uint64_t*)(out + 8) != 0) return 1;
914
if (*(uint64_t*)(out + 12) != 0) return 1;
915
return 0;
916
}
917
918
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
919
const VP8Matrix* WEBP_RESTRICT const mtx) {
920
int nz;
921
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
922
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
923
return nz;
924
}
925
926
#endif // !WORK_AROUND_GCC
927
928
#if WEBP_AARCH64
929
930
#if BPS == 32
931
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
932
do { \
933
uint8x16_t r; \
934
r = vqtbl2q_u8(qcombined, tbl); \
935
r = vreinterpretq_u8_u32( \
936
vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \
937
vreinterpretq_u32_u8(r), 1)); \
938
vst1q_u8(dst, r); \
939
} while (0)
940
941
#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \
942
do { \
943
uint8x16_t r; \
944
r = vqtbl2q_u8(qcombined, tbl); \
945
vst1q_u8(dst, r); \
946
} while (0)
947
948
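// Computes all ten 4x4 intra predictors for one block; each 4x4 predictor is
// stored at its own I4* offset within 'dst', with rows BPS bytes apart.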
static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
949
const uint8_t* WEBP_RESTRICT top) {
950
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
951
// L K J I X A B C D E F G H
952
// -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
953
static const uint8_t kLookupTbl1[64] = {
954
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12,
955
3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0,
956
4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16,
957
2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31
958
};
959
960
static const uint8_t kLookupTbl2[64] = {
961
20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9,
962
19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
963
18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26,
964
17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
965
};
966
967
static const uint8_t kLookupTbl3[64] = {
968
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19,
969
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18,
970
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17,
971
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16
972
};
973
974
const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
975
const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
976
const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
977
978
const uint8x16_t preload = vld1q_u8(top - 5);
979
uint8x16x2_t qcombined;
980
uint8x16_t result0, result1;
981
982
uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
983
uint8x16_t b = preload;
984
uint8x16_t c = vextq_u8(a, a, 2);
985
986
uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
987
uint8x16_t avg2_all = vrhaddq_u8(a, b);
988
989
uint8x8_t preload_x8, sub_a, sub_c;
990
uint8_t result_u8;
991
uint8x8_t res_lo, res_hi;
992
uint8x16_t full_b;
993
uint16x8_t sub, sum_lo, sum_hi;
994
995
preload_x8 = vget_low_u8(c);
996
preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
997
998
result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
999
1000
avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
1001
avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
1002
1003
qcombined.val[0] = avg2_all;
1004
qcombined.val[1] = avg3_all;
1005
1006
sub_a = vdup_laneq_u8(preload, 4);
1007
1008
// preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
1009
full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
1010
// preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
1011
sub_c = vreinterpret_u8_u32(vdup_n_u32(
1012
vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
1013
1014
sub = vsubl_u8(sub_c, sub_a);
1015
sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
1016
res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
1017
1018
sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
1019
res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
1020
1021
// DC4, VE4, HE4, TM4
1022
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
1023
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
1024
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
1025
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
1026
1027
// RD4, VR4, LD4, VL4
1028
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
1029
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
1030
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
1031
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
1032
1033
// HD4, HU4
1034
result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
1035
result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
1036
1037
vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
1038
vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
1039
vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
1040
vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
1041
}
1042
#endif // BPS == 32
1043
1044
static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
1045
uint8x16_t a = vdupq_n_u8(value);
1046
int i;
1047
for (i = 0; i < 16; i++) {
1048
vst1q_u8(dst + BPS * i, a);
1049
}
1050
}
1051
1052
static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
1053
uint8x16_t a = vld1q_u8(src);
1054
int i;
1055
for (i = 0; i < 16; i++) {
1056
vst1q_u8(dst + BPS * i, a);
1057
}
1058
}
1059
1060
static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
1061
const uint8_t* left) {
1062
uint8x16_t a;
1063
1064
if (left == NULL) {
1065
Fill_NEON(dst, 129);
1066
return;
1067
}
1068
1069
a = vld1q_u8(left + 0);
1070
vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
1071
vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
1072
vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
1073
vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
1074
vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
1075
vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
1076
vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
1077
vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
1078
vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
1079
vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
1080
vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
1081
vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
1082
vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
1083
vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
1084
vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
1085
vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
1086
}
1087
1088
static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
1089
if (top != NULL) {
1090
Fill16_NEON(dst, top);
1091
} else {
1092
Fill_NEON(dst, 127);
1093
}
1094
}
1095
1096
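// 16x16 DC predictor: averages the available top and/or left samples
// (rounded), or falls back to 0x80 when neither is present.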
static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
1097
const uint8_t* top) {
1098
uint8_t s;
1099
1100
if (top != NULL) {
1101
uint16_t dc;
1102
dc = vaddlvq_u8(vld1q_u8(top));
1103
if (left != NULL) {
1104
// top and left present.
1105
dc += vaddlvq_u8(vld1q_u8(left));
1106
s = vqrshrnh_n_u16(dc, 5);
1107
} else {
1108
// top but no left.
1109
s = vqrshrnh_n_u16(dc, 4);
1110
}
1111
} else {
1112
if (left != NULL) {
1113
uint16_t dc;
1114
// left but no top.
1115
dc = vaddlvq_u8(vld1q_u8(left));
1116
s = vqrshrnh_n_u16(dc, 4);
1117
} else {
1118
// No top, no left, nothing.
1119
s = 0x80;
1120
}
1121
}
1122
Fill_NEON(dst, s);
1123
}
1124
1125
static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
1126
const uint8x8_t outer,
1127
const uint8x8x2_t inner,
1128
const uint16x8_t a, int i,
1129
const int n) {
1130
uint8x8_t d1, d2;
1131
uint16x8_t r1, r2;
1132
1133
r1 = vaddl_u8(outer, inner.val[0]);
1134
r1 = vqsubq_u16(r1, a);
1135
d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
1136
r2 = vaddl_u8(outer, inner.val[1]);
1137
r2 = vqsubq_u16(r2, a);
1138
d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
1139
vst1_u8(dst + BPS * (i * 4 + n), d1);
1140
vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
1141
}
1142
1143
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
1144
const uint8_t* top) {
1145
int i;
1146
uint16x8_t a;
1147
uint8x8x2_t inner;
1148
1149
if (left == NULL) {
1150
// True motion without left samples (hence: with default 129 value) is
1151
// equivalent to VE prediction where you just copy the top samples.
1152
// Note that if top samples are not available, the default value is then
1153
// 129, and not 127 as in the VerticalPred case.
1154
if (top != NULL) {
1155
VerticalPred16_NEON(dst, top);
1156
} else {
1157
Fill_NEON(dst, 129);
1158
}
1159
return;
1160
}
1161
1162
// left is not NULL.
1163
if (top == NULL) {
1164
HorizontalPred16_NEON(dst, left);
1165
return;
1166
}
1167
1168
// Neither left nor top are NULL.
1169
a = vdupq_n_u16(left[-1]);
1170
inner = vld1_u8_x2(top);
1171
1172
for (i = 0; i < 4; i++) {
1173
const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
1174
1175
TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
1176
TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
1177
TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
1178
TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
1179
}
1180
}
1181
1182
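// Computes the four 16x16 intra predictors (DC, vertical, horizontal and
// TrueMotion), each written at its own I16* offset inside 'dst'.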
static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
1183
const uint8_t* WEBP_RESTRICT left,
1184
const uint8_t* WEBP_RESTRICT top) {
1185
DCMode_NEON(I16DC16 + dst, left, top);
1186
VerticalPred16_NEON(I16VE16 + dst, top);
1187
HorizontalPred16_NEON(I16HE16 + dst, left);
1188
TrueMotion_NEON(I16TM16 + dst, left, top);
1189
}
1190
1191
#endif // WEBP_AARCH64
1192
1193
//------------------------------------------------------------------------------
1194
// Entry point
1195
1196
extern void VP8EncDspInitNEON(void);
1197
1198
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
1199
VP8ITransform = ITransform_NEON;
1200
VP8FTransform = FTransform_NEON;
1201
1202
VP8FTransformWHT = FTransformWHT_NEON;
1203
1204
VP8TDisto4x4 = Disto4x4_NEON;
1205
VP8TDisto16x16 = Disto16x16_NEON;
1206
VP8CollectHistogram = CollectHistogram_NEON;
1207
1208
VP8SSE16x16 = SSE16x16_NEON;
1209
VP8SSE16x8 = SSE16x8_NEON;
1210
VP8SSE8x8 = SSE8x8_NEON;
1211
VP8SSE4x4 = SSE4x4_NEON;
1212
1213
#if WEBP_AARCH64
1214
#if BPS == 32
1215
VP8EncPredLuma4 = Intra4Preds_NEON;
1216
#endif
1217
VP8EncPredLuma16 = Intra16Preds_NEON;
1218
#endif
1219
1220
#if !defined(WORK_AROUND_GCC)
1221
VP8EncQuantizeBlock = QuantizeBlock_NEON;
1222
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
1223
VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
1224
#endif
1225
}
1226
1227
#else // !WEBP_USE_NEON
1228
1229
WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
1230
1231
#endif // WEBP_USE_NEON
1232
1233