// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// that here the prediction is read from a separate 'ref' pointer and the
// reconstruction is written to 'dst'. See the comments there for algorithmic
// explanations.

static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
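// Note on the fixed-point arithmetic used below: vqdmulhq_n_s16(x, c) returns
// (2 * x * c) >> 16 with saturation. Using kC2 / 2 as the constant therefore
// yields (x * kC2) >> 16 directly, while for kC1 the doubled product is halved
// again with a shift-right-and-accumulate (vsraq_n_s16), which also folds in
// the "+ x" term of the first multiplier (assuming WEBP_TRANSFORM_AC3_C1 ==
// 20091 and WEBP_TRANSFORM_AC3_C2 == 35468, the usual VP8 constants).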

// This code works but is *slower* than the inline-asm version below
// (with gcc-4.6), so it is disabled for now. Later, it'll be made conditional
// on the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inline assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* WEBP_RESTRICT const ref,
                                    uint8_t* WEBP_RESTRICT const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}
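// Note: vrsraq_n_s16(dst, row, 3) above computes dst + ((row + 4) >> 3), i.e.
// the rounding descale of the inverse transform fused with the addition of the
// prediction, before the final unsigned saturation to 8 bits.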

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
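// Note: each TransformPass_NEON call performs one 1-D inverse transform on the
// two packed rows and ends with a transpose, so calling it twice (as done in
// ITransformOne_NEON below) processes both dimensions of the 4x4 block.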

static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16 {q1, q2}, [%[in]] \n"
    "vld1.16 {d0}, [%[kC1C2]] \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp d3, d4 \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    // q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16 q8, q8, #1 \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16 q8, q2, q8 \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    "vswp d3, d4 \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    "vshr.s16 q8, q8, #1 \n"
    "vqadd.s16 q8, q2, q8 \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vld1.32 d6[0], [%[ref]], %[kBPS] \n"
    "vld1.32 d6[1], [%[ref]], %[kBPS] \n"
    "vld1.32 d7[0], [%[ref]], %[kBPS] \n"
    "vld1.32 d7[1], [%[ref]], %[kBPS] \n"

    "sub %[ref], %[ref], %[kBPS], lsl #2 \n"

    // ((val) + 4) >> 3
    "vrshr.s16 d2, d2, #3 \n"
    "vrshr.s16 d3, d3, #3 \n"
    "vrshr.s16 d4, d4, #3 \n"
    "vrshr.s16 d5, d5, #3 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    // Must accumulate before saturating.
    "vmovl.u8 q8, d6 \n"
    "vmovl.u8 q9, d7 \n"

    "vqadd.s16 q1, q1, q8 \n"
    "vqadd.s16 q2, q2, q9 \n"

    "vqmovun.s16 d0, q1 \n"
    "vqmovun.s16 d1, q2 \n"

    "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
    "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
    "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
    "vst1.32 d1[1], [%[dst]] \n"

    : [in] "+r"(in), [dst] "+r"(dst)  // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

#endif  // WEBP_USE_INTRINSICS

static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
                            const int16_t* WEBP_RESTRICT in,
                            uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}
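// Note: when 'do_two' is set, the second call handles the 4x4 block sitting
// four pixels to the right: its 16 coefficients follow at in + 16 and its
// output starts at dst + 4 within the same BPS-strided buffer.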

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  int16x8_t d0d1, d3d2;  // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {  // 1st pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);  // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);  // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {  // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);  // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);  // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out + 0, out0);
    vst1_s16(out + 4, out1);
    vst1_s16(out + 8, out2);
    vst1_s16(out + 12, out3);
  }
}
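// Note: this follows the scalar reference FTransform. Each pass forms
//   a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3;
// the first pass outputs (a0 + a1) << 3, (a2*2217 + a3*5352 + 1812) >> 9,
// (a0 - a1) << 3 and (a3*2217 - a2*5352 + 937) >> 9, while the second pass
// descales with (... + 7) >> 4 and (... + 12000 / 51000) >> 16, adding 1 to
// out[4..7] wherever a3 != 0 (handled above by the (1 << 16) bias and the
// vceq/vadd pair).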

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d11}, [%[src_ptr]] \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d15}, [%[ref_ptr]] \n"

    // Pack the high values in to q4 and q6
    "vtrn.32 q4, q5 \n"
    "vtrn.32 q6, q7 \n"

    // d[0-3] = src - ref
    "vsubl.u8 q0, d8, d12 \n"
    "vsubl.u8 q1, d9, d13 \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16 {q8}, [%[coeff16]] \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32 {q9, q10}, [%[coeff32]]! \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32 {q11,q12}, [%[coeff32]] \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32 d0, d2 \n"
    "vtrn.32 d1, d3 \n"
    "vtrn.16 d0, d1 \n"
    "vtrn.16 d2, d3 \n"

    "vadd.s16 d4, d0, d3 \n"  // a0 = d0 + d3
    "vadd.s16 d5, d1, d2 \n"  // a1 = d1 + d2
    "vsub.s16 d6, d1, d2 \n"  // a2 = d1 - d2
    "vsub.s16 d7, d0, d3 \n"  // a3 = d0 - d3

    "vadd.s16 d0, d4, d5 \n"  // a0 + a1
    "vshl.s16 d0, d0, #3 \n"  // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16 d2, d4, d5 \n"  // a0 - a1
    "vshl.s16 d2, d2, #3 \n"  // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16 q9, d7, d16 \n"   // a3*5352 + 1812
    "vmlal.s16 q10, d7, d17 \n"  // a3*2217 + 937
    "vmlal.s16 q9, d6, d17 \n"   // a2*2217 + a3*5352 + 1812
    "vmlsl.s16 q10, d6, d16 \n"  // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32 d1, q9, #9 \n"
    "vshrn.s32 d3, q10, #9 \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32 d0, d2 \n"
    "vtrn.32 d1, d3 \n"
    "vtrn.16 d0, d1 \n"
    "vtrn.16 d2, d3 \n"

    "vmov.s16 d26, #7 \n"

    "vadd.s16 d4, d0, d3 \n"   // a1 = ip[0] + ip[12]
    "vadd.s16 d5, d1, d2 \n"   // b1 = ip[4] + ip[8]
    "vsub.s16 d6, d1, d2 \n"   // c1 = ip[4] - ip[8]
    "vadd.s16 d4, d4, d26 \n"  // a1 + 7
    "vsub.s16 d7, d0, d3 \n"   // d1 = ip[0] - ip[12]

    "vadd.s16 d0, d4, d5 \n"   // op[0] = a1 + b1 + 7
    "vsub.s16 d2, d4, d5 \n"   // op[8] = a1 - b1 + 7

    "vmlal.s16 q11, d7, d16 \n"  // d1*5352 + 12000
    "vmlal.s16 q12, d7, d17 \n"  // d1*2217 + 51000

    "vceq.s16 d4, d7, #0 \n"

    "vshr.s16 d0, d0, #4 \n"
    "vshr.s16 d2, d2, #4 \n"

    "vmlal.s16 q11, d6, d17 \n"  // c1*2217 + d1*5352 + 12000
    "vmlsl.s16 q12, d6, d16 \n"  // d1*2217 - c1*5352 + 51000

    "vmvn d4, d4 \n"  // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32 d1, q11, #16 \n"
    // op[4] += (d1!=0)
    "vsub.s16 d1, d1, d4 \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32 d3, q12, #16 \n"

    // set result to out array
    "vst1.16 {q0, q1}, [%[out]] \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)  // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)  // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"  // clobbered
  );
}

#endif

#define LOAD_LANE_16b(VALUE, LANE) do {            \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));   \
  src += stride;                                   \
} while (0)

static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
                               int16_t* WEBP_RESTRICT out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out + 0, out0);
    vst1_s16(out + 4, out1);
    vst1_s16(out + 8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b
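// Note: FTransformWHT operates on the DC coefficients of a macroblock, which
// is why the loads above step through 'src' with a stride of 16 int16s
// (src[0], src[16], src[32], ...). The two passes form a 4x4 Walsh-Hadamard
// transform; the halving adds and subtracts (vhaddq/vhsubq) provide the final
// >> 1 indicated in their comments.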

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b
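// Note: each d register holds one row of 'a' in lanes 0..3 and the matching
// row of 'b' in lanes 4..7, so the passes above transform both blocks at once.
// DistoSum then yields the w-weighted sum over |coefficients of a| minus the
// same sum for 'b', and the value returned is the absolute difference of the
// two weighted sums, shifted right by 5, matching the scalar version.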

static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}
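// Note: each transformed block is binned by |coefficient| >> 3, clipped to
// MAX_COEFF_THRESH, before the per-bin counts are handed to
// VP8SetHistogramData; only the binning is vectorized, the count loop stays
// scalar.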

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16_NEON(
    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
    uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  return (int)vaddvq_u32(sum);
#else
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}

static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}
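// Note: the SSE*_NEON routines compute the sum of squared pixel differences;
// squaring |a - b| (vabd followed by vmull with itself) gives the same result
// as squaring the signed difference while keeping everything unsigned.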

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
                               const VP8Matrix* WEBP_RESTRICT const mtx,
                               int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);  // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));  // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}
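// Note: the quantized level is ((|in| + sharpen) * iq + bias) >> QFIX with
// QFIX == 17; the halving add (vhaddq_u32) provides the >> 1 and the narrowing
// shift (vshrn_n_u32(..., 16)) the remaining >> 16. The sign is restored with
// (x ^ sign) - sign, where 'sign' is the arithmetic >> 15 of the input (0 or
// -1), i.e. a branch-free two's-complement negate.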

static const uint8_t kShuffles[4][8] = {
  {  0,  1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out + 0) != 0) return 1;
  if (*(uint64_t*)(out + 4) != 0) return 1;
  if (*(uint64_t*)(out + 8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}
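// Note: kShuffles works on bytes. Each int16 level occupies byte pair
// (2*n, 2*n + 1) of {out0, out1}, and the table entries pick those pairs in
// VP8 zigzag order (0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15), so
// the four table lookups emit the zigzag-reordered block directly.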

static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif  // !WORK_AROUND_GCC

#if WEBP_AARCH64

#if BPS == 32
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                       \
  do {                                                                  \
    uint8x16_t r;                                                       \
    r = vqtbl2q_u8(qcombined, tbl);                                     \
    r = vreinterpretq_u8_u32(                                           \
        vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),   \
                       vreinterpretq_u32_u8(r), 1));                    \
    vst1q_u8(dst, r);                                                   \
  } while (0)

#define RD4_VR4_LD4_VL4_NEON(dst, tbl)   \
  do {                                   \
    uint8x16_t r;                        \
    r = vqtbl2q_u8(qcombined, tbl);      \
    vst1q_u8(dst, r);                    \
  } while (0)

static WEBP_INLINE uint8x8x2_t Vld1U8x2(const uint8_t* ptr) {
#if LOCAL_CLANG_PREREQ(3, 4) || LOCAL_GCC_PREREQ(8, 5) || defined(_MSC_VER)
  return vld1_u8_x2(ptr);
#else
  uint8x8x2_t res;
  INIT_VECTOR2(res, vld1_u8(ptr + 0 * 8), vld1_u8(ptr + 1 * 8));
  return res;
#endif
}

static WEBP_INLINE uint8x16x4_t Vld1qU8x4(const uint8_t* ptr) {
#if LOCAL_CLANG_PREREQ(3, 4) || LOCAL_GCC_PREREQ(9, 4) || defined(_MSC_VER)
  return vld1q_u8_x4(ptr);
#else
  uint8x16x4_t res;
  INIT_VECTOR4(res,
               vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
               vld1q_u8(ptr + 2 * 16), vld1q_u8(ptr + 3 * 16));
  return res;
#endif
}

static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  //   0   1   2   3   4   5   6   7   8   9  10  11  12  13
  //   L   K   J   I   X   A   B   C   D   E   F   G   H
  //  -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
  static const uint8_t kLookupTbl1[64] = {
    0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12,
    3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,
    4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16,
    2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31
  };

  static const uint8_t kLookupTbl2[64] = {
    20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9,
    19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
    18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26,
    17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
  };

  static const uint8_t kLookupTbl3[64] = {
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16
  };

  const uint8x16x4_t lookup_avgs1 = Vld1qU8x4(kLookupTbl1);
  const uint8x16x4_t lookup_avgs2 = Vld1qU8x4(kLookupTbl2);
  const uint8x16x4_t lookup_avgs3 = Vld1qU8x4(kLookupTbl3);

  const uint8x16_t preload = vld1q_u8(top - 5);
  uint8x16x2_t qcombined;
  uint8x16_t result0, result1;

  uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
  uint8x16_t b = preload;
  uint8x16_t c = vextq_u8(a, a, 2);

  uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
  uint8x16_t avg2_all = vrhaddq_u8(a, b);

  uint8x8_t preload_x8, sub_a, sub_c;
  uint8_t result_u8;
  uint8x8_t res_lo, res_hi;
  uint8x16_t full_b;
  uint16x8_t sub, sum_lo, sum_hi;

  preload_x8 = vget_low_u8(c);
  preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);

  result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;

  avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
  avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);

  qcombined.val[0] = avg2_all;
  qcombined.val[1] = avg3_all;

  sub_a = vdup_laneq_u8(preload, 4);

  // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
  full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
  // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
  sub_c = vreinterpret_u8_u32(vdup_n_u32(
      vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));

  sub = vsubl_u8(sub_c, sub_a);
  sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
  res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));

  sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
  res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));

  // DC4, VE4, HE4, TM4
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);

  // RD4, VR4, LD4, VL4
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);

  // HD4, HU4
  result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
  result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);

  vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
  vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
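// Note: the 4x4 predictors are all derived from two precomputed vectors,
// avg2_all (rounded 2-tap averages) and avg3_all (3-tap averages) of the L..H
// boundary samples loaded at top - 5; the kLookupTbl* tables then drive
// vqtbl2q_u8 byte selects that assemble each 16-byte output row, with the DC
// average patched into avg3_all[14] beforehand and the TrueMotion rows taken
// from res_lo/res_hi.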
#endif  // BPS == 32

static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
  uint8x16_t a = vdupq_n_u8(value);
  int i;
  for (i = 0; i < 16; i++) {
    vst1q_u8(dst + BPS * i, a);
  }
}

static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
  uint8x16_t a = vld1q_u8(src);
  int i;
  for (i = 0; i < 16; i++) {
    vst1q_u8(dst + BPS * i, a);
  }
}

static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
                                              const uint8_t* left) {
  uint8x16_t a;

  if (left == NULL) {
    Fill_NEON(dst, 129);
    return;
  }

  a = vld1q_u8(left + 0);
  vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
  vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
  vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
  vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
  vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
  vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
  vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
  vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
  vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
  vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
  vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
  vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
  vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
  vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
  vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
  vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
}

static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
  if (top != NULL) {
    Fill16_NEON(dst, top);
  } else {
    Fill_NEON(dst, 127);
  }
}

static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
                                    const uint8_t* top) {
  uint8_t s;

  if (top != NULL) {
    uint16_t dc;
    dc = vaddlvq_u8(vld1q_u8(top));
    if (left != NULL) {
      // top and left present.
      dc += vaddlvq_u8(vld1q_u8(left));
      s = vqrshrnh_n_u16(dc, 5);
    } else {
      // top but no left.
      s = vqrshrnh_n_u16(dc, 4);
    }
  } else {
    if (left != NULL) {
      uint16_t dc;
      // left but no top.
      dc = vaddlvq_u8(vld1q_u8(left));
      s = vqrshrnh_n_u16(dc, 4);
    } else {
      // No top, no left, nothing.
      s = 0x80;
    }
  }
  Fill_NEON(dst, s);
}
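// Note: vaddlvq_u8 sums all 16 boundary bytes, and vqrshrnh_n_u16 is a
// rounding narrowing shift, so the DC value is (sum + 16) >> 5 when both top
// and left are present (32 samples) and (sum + 8) >> 4 when only one side is.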

static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
                                              const uint8x8_t outer,
                                              const uint8x8x2_t inner,
                                              const uint16x8_t a, int i,
                                              const int n) {
  uint8x8_t d1, d2;
  uint16x8_t r1, r2;

  r1 = vaddl_u8(outer, inner.val[0]);
  r1 = vqsubq_u16(r1, a);
  d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
  r2 = vaddl_u8(outer, inner.val[1]);
  r2 = vqsubq_u16(r2, a);
  d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
  vst1_u8(dst + BPS * (i * 4 + n), d1);
  vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
}

static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  int i;
  uint16x8_t a;
  uint8x8x2_t inner;

  if (left == NULL) {
    // True motion without left samples (hence: with default 129 value) is
    // equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is then
    // 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
      VerticalPred16_NEON(dst, top);
    } else {
      Fill_NEON(dst, 129);
    }
    return;
  }

  // left is not NULL.
  if (top == NULL) {
    HorizontalPred16_NEON(dst, left);
    return;
  }

  // Neither left nor top are NULL.
  a = vdupq_n_u16(left[-1]);
  inner = Vld1U8x2(top);

  for (i = 0; i < 4; i++) {
    const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);

    TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
    TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
    TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
    TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
  }
}
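// Note: TrueMotion predicts pred(x, y) = clamp(left[y] + top[x] - corner),
// where 'corner' is the top-left sample left[-1]. The helper computes
// left[y] + top[x] as a widening add, subtracts the corner with unsigned
// saturation and narrows back with saturation, covering one 16-pixel row
// (two 8-lane halves) per call.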

static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                              const uint8_t* WEBP_RESTRICT left,
                              const uint8_t* WEBP_RESTRICT top) {
  DCMode_NEON(I16DC16 + dst, left, top);
  VerticalPred16_NEON(I16VE16 + dst, top);
  HorizontalPred16_NEON(I16HE16 + dst, left);
  TrueMotion_NEON(I16TM16 + dst, left, top);
}

#endif  // WEBP_AARCH64

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if WEBP_AARCH64
#if BPS == 32
  VP8EncPredLuma4 = Intra4Preds_NEON;
#endif
  VP8EncPredLuma16 = Intra16Preds_NEON;
#endif

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
  VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON