GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/libwebp/src/dsp/enc_neon.c
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for the subtraction to *ref. See the comments there for algorithmic
// explanations.

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
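// (These are the standard VP8 fixed-point constants: 20091 is approximately
// (sqrt(2) * cos(pi/8) - 1) * 65536, and 17734 is half of 35468, itself
// approximately sqrt(2) * sin(pi/8) * 65536. Storing kC2 pre-halved
// compensates for the implicit doubling performed by vqdmulh below.)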

// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* const ref,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}
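// Note: per lane, vrsraq_n_s16(dst, row, 3) in Add4x4_NEON above computes
// dst + ((row + 4) >> 3), folding the usual '+4' rounding bias of the final
// descale into a single accumulate-and-shift instruction.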

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
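// Note: because TransformPass_NEON ends with a transpose, running it twice
// performs the vertical and then the horizontal pass with the same butterfly
// code, instead of keeping two separate code paths.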

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    // q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub             %[ref], %[ref], %[kBPS], lsl #2 \n"

    // ((val) + 4) >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating.
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)                        // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

#endif    // WEBP_USE_INTRINSICS

static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}
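// Note: 'do_two' handles two horizontally adjacent 4x4 blocks: the second
// block's 16 coefficients directly follow the first's in 'in', and its
// pixels sit 4 columns to the right in both 'ref' and 'dst'.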

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // first pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // second pass
    // the (1 << 16) addition is for the replacement: a3 != 0  <->  1 - (a3 == 0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out + 0, out0);
    vst1_s16(out + 4, out1);
    vst1_s16(out + 8, out2);
    vst1_s16(out + 12, out3);
  }
}
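// Note: 2217 and 5352 are roughly sqrt(2)*sin(pi/8) and sqrt(2)*cos(pi/8) in
// Q12 fixed point; 1812/937 (first pass, >> 9) and 12000/51000 (second pass,
// narrowing >> 16 via vaddhn) are the matching rounding biases of the scalar
// FTransform.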

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352,  5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values into q4 and q6.
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8 (d16 = 5352, d17 = 2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11 = 12000, q12 = 51000
    "vld1.32     {q11, q12}, [%[coeff32]]     \n"

    // part 1
    // Transpose. Register dN is the same as dN in C.
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n"   // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n"   // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n"   // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n"   // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n"   // a0 + a1
    "vshl.s16        d0, d0, #3               \n"   // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n"   // a0 - a1
    "vshl.s16        d2, d2, #3               \n"   // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n"   // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n"   // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n"   // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n"   // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0 = ip[0], d1 = ip[4], d2 = ip[8], d3 = ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n"   // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n"   // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n"   // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n"   // a1 + 7
    "vsub.s16        d7, d0, d3               \n"   // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n"   // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n"   // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n"   // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n"   // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n"   // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n"   // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n"   // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000) >> 16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1 != 0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12] = (d1*2217 - c1*5352 + 51000) >> 16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]       \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)                          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"                       // clobbered
  );
}

#endif

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);   // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);   // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);   // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);   // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out + 0, out0);
    vst1_s16(out + 4, out1);
    vst1_s16(out + 8, out2);
    vst1_s16(out + 12, out3);
  }
}
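// Note: 'src' walks the DC terms of the 16 luma 4x4 blocks, which are laid
// out 16 coefficients apart -- hence stride == 16. The vhadd/vhsub halving in
// the second pass matches the final '>> 1' of the scalar FTransformWHT.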
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = abs(a0 + a1)
  // tmp[1] = abs(a3 + a2)
  // tmp[2] = abs(a3 - a2)
  // tmp[3] = abs(a0 - a1)
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}
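// Note: each input row holds block 'a' results in its low half and block 'b'
// results in its high half, so the vmull/vmlsl pairs accumulate
// w * |H(a)| - w * |H(b)| in one go; Disto4x4_NEON below then applies the
// final abs() and '>> 5'.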

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal
    // passes are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b

static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}
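// Note: the bin of each coefficient is min(|coeff| >> 3, MAX_COEFF_THRESH),
// computed 8 lanes at a time above before the scalar accumulation loop.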

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
                                             const uint8_t* const b,
                                             uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
  return (int)sum3;
}
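// Note: vabd + vmull squares each byte difference exactly: |a - b| <= 255,
// so |a - b|^2 <= 65025 fits in the unsigned 16-bit products before the
// pairwise widening adds.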

static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

static int16x8_t Quantize_NEON(int16_t* const in,
                               const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);   // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}
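// Per coefficient the above is, roughly, the scalar quantizer (QFIX == 17):
//   level = min(((abs(in) + sharpen) * iq + bias) >> 17, MAX_LEVEL);
//   in    = sign * level * q;         // dequantized value, stored back
//   return sign * level;
// vhaddq_u32 halves (coeff * iQ + bias), so the 16-bit narrowing shift
// completes the 17-bit descale.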

static const uint8_t kShuffles[4][8] = {
  {  0,  1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
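// Each byte pair above selects one 16-bit coefficient from the 32 bytes of
// {out0 | out1}; taken together the four tables realize VP8's zigzag order
// {0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}.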

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out + 0) != 0) return 1;
  if (*(uint64_t*)(out + 4) != 0) return 1;
  if (*(uint64_t*)(out + 8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON