GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/dsp/enc_neon.c
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

26
// Inverse transform.
27
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.
29
30
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
31
static const int16_t kC2 =
32
WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.
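// Note: kC2 is stored pre-halved to compensate for vqdmulh, which returns the
// high half of the *doubled* product; vqdmulhq_n_s16(x, kC2) therefore
// computes (x * WEBP_TRANSFORM_AC3_C2) >> 16.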
33
34
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inline assembly.
38
#if defined(WEBP_USE_INTRINSICS)
39
40
// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
41
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
42
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
43
}
44
45
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
46
// to the corresponding rows of 'dst'.
47
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
48
const int16x8_t dst01,
49
const int16x8_t dst23) {
50
// Unsigned saturate to 8b.
51
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
52
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
53
54
// Store the results.
55
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
56
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
57
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
58
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
59
}
60
61
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
62
const int16x8_t row23,
63
const uint8_t* WEBP_RESTRICT const ref,
64
uint8_t* WEBP_RESTRICT const dst) {
65
uint32x2_t dst01 = vdup_n_u32(0);
66
uint32x2_t dst23 = vdup_n_u32(0);
67
68
// Load the source pixels.
69
dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
70
dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
71
dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
72
dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
73
74
{
75
// Convert to 16b.
76
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
77
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
78
79
// Descale with rounding.
80
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
81
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
82
// Add the inverse transform.
83
SaturateAndStore4x4_NEON(dst, out01, out23);
84
}
85
}
86
87
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
88
const int16x8_t in1,
89
int16x8x2_t* const out) {
90
// a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3   =>   a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1);  // a0 c0 a1 c1 a2 c2 ...
                                               // b0 d0 b1 d1 b2 d2 ...
94
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
95
}
96
97
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
98
// {rows} = in0 | in4
99
// in8 | in12
100
// B1 = in4 | in12
101
const int16x8_t B1 =
102
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
103
// C0 = kC1 * in4 | kC1 * in12
104
// C1 = kC2 * in4 | kC2 * in12
105
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
106
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
107
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
108
vget_low_s16(rows->val[1])); // in0 + in8
109
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
110
vget_low_s16(rows->val[1])); // in0 - in8
111
// c = kC2 * in4 - kC1 * in12
112
// d = kC1 * in4 + kC2 * in12
113
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
114
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
115
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
116
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
117
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
118
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
119
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
120
Transpose8x2_NEON(E0, E1, rows);
121
}
122
123
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
124
const int16_t* WEBP_RESTRICT in,
125
uint8_t* WEBP_RESTRICT dst) {
126
int16x8x2_t rows;
127
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
128
TransformPass_NEON(&rows);
129
TransformPass_NEON(&rows);
130
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
131
}
132
133
#else
134
135
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
136
const int16_t* WEBP_RESTRICT in,
137
uint8_t* WEBP_RESTRICT dst) {
138
const int kBPS = BPS;
139
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
140
141
__asm__ volatile (
142
"vld1.16 {q1, q2}, [%[in]] \n"
143
"vld1.16 {d0}, [%[kC1C2]] \n"
144
145
// d2: in[0]
146
// d3: in[8]
147
// d4: in[4]
148
// d5: in[12]
149
"vswp d3, d4 \n"
150
151
// q8 = {in[4], in[12]} * kC1 * 2 >> 16
152
// q9 = {in[4], in[12]} * kC2 >> 16
153
"vqdmulh.s16 q8, q2, d0[0] \n"
154
"vqdmulh.s16 q9, q2, d0[1] \n"
155
156
// d22 = a = in[0] + in[8]
157
// d23 = b = in[0] - in[8]
158
"vqadd.s16 d22, d2, d3 \n"
159
"vqsub.s16 d23, d2, d3 \n"
160
161
// q8 = in[4]/[12] * kC1 >> 16
162
"vshr.s16 q8, q8, #1 \n"
163
164
// Add {in[4], in[12]} back after the multiplication.
165
"vqadd.s16 q8, q2, q8 \n"
166
167
// d20 = c = in[4]*kC2 - in[12]*kC1
168
// d21 = d = in[4]*kC1 + in[12]*kC2
169
"vqsub.s16 d20, d18, d17 \n"
170
"vqadd.s16 d21, d19, d16 \n"
171
172
// d2 = tmp[0] = a + d
173
// d3 = tmp[1] = b + c
174
// d4 = tmp[2] = b - c
175
// d5 = tmp[3] = a - d
176
"vqadd.s16 d2, d22, d21 \n"
177
"vqadd.s16 d3, d23, d20 \n"
178
"vqsub.s16 d4, d23, d20 \n"
179
"vqsub.s16 d5, d22, d21 \n"
180
181
"vzip.16 q1, q2 \n"
182
"vzip.16 q1, q2 \n"
183
184
"vswp d3, d4 \n"
185
186
// q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
187
// q9 = {tmp[4], tmp[12]} * kC2 >> 16
188
"vqdmulh.s16 q8, q2, d0[0] \n"
189
"vqdmulh.s16 q9, q2, d0[1] \n"
190
191
// d22 = a = tmp[0] + tmp[8]
192
// d23 = b = tmp[0] - tmp[8]
193
"vqadd.s16 d22, d2, d3 \n"
194
"vqsub.s16 d23, d2, d3 \n"
195
196
"vshr.s16 q8, q8, #1 \n"
197
"vqadd.s16 q8, q2, q8 \n"
198
199
// d20 = c = in[4]*kC2 - in[12]*kC1
200
// d21 = d = in[4]*kC1 + in[12]*kC2
201
"vqsub.s16 d20, d18, d17 \n"
202
"vqadd.s16 d21, d19, d16 \n"
203
204
// d2 = tmp[0] = a + d
205
// d3 = tmp[1] = b + c
206
// d4 = tmp[2] = b - c
207
// d5 = tmp[3] = a - d
208
"vqadd.s16 d2, d22, d21 \n"
209
"vqadd.s16 d3, d23, d20 \n"
210
"vqsub.s16 d4, d23, d20 \n"
211
"vqsub.s16 d5, d22, d21 \n"
212
213
"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
214
"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
215
"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
216
"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
217
218
"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
219
220
// ((val) + 4) >> 3
221
"vrshr.s16 d2, d2, #3 \n"
222
"vrshr.s16 d3, d3, #3 \n"
223
"vrshr.s16 d4, d4, #3 \n"
224
"vrshr.s16 d5, d5, #3 \n"
225
226
"vzip.16 q1, q2 \n"
227
"vzip.16 q1, q2 \n"
228
229
// Must accumulate before saturating
230
"vmovl.u8 q8, d6 \n"
231
"vmovl.u8 q9, d7 \n"
232
233
"vqadd.s16 q1, q1, q8 \n"
234
"vqadd.s16 q2, q2, q9 \n"
235
236
"vqmovun.s16 d0, q1 \n"
237
"vqmovun.s16 d1, q2 \n"
238
239
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
240
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
241
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
242
"vst1.32 d1[1], [%[dst]] \n"
243
244
: [in] "+r"(in), [dst] "+r"(dst) // modified registers
245
: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
246
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
247
);
248
}
249
250
#endif // WEBP_USE_INTRINSICS
251
252
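// Inverse-transforms one 4x4 block; when 'do_two' is set, a second block is
// also processed (coefficients at in + 16, reconstruction 4 pixels to the
// right in both 'ref' and 'dst').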
static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
253
const int16_t* WEBP_RESTRICT in,
254
uint8_t* WEBP_RESTRICT dst, int do_two) {
255
ITransformOne_NEON(ref, in, dst);
256
if (do_two) {
257
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
258
}
259
}
260
261
// Load all 4x4 pixels into a single uint8x16_t variable.
262
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
263
uint32x4_t out = vdupq_n_u32(0);
264
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
265
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
266
out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
267
out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
268
return vreinterpretq_u8_u32(out);
269
}
270
271
// Forward transform.
272
273
#if defined(WEBP_USE_INTRINSICS)
274
275
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
276
const int16x4_t B,
277
const int16x4_t C,
278
const int16x4_t D,
279
int16x8_t* const out01,
280
int16x8_t* const out32) {
281
const int16x4x2_t AB = vtrn_s16(A, B);
282
const int16x4x2_t CD = vtrn_s16(C, D);
283
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
284
vreinterpret_s32_s16(CD.val[0]));
285
const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
286
vreinterpret_s32_s16(CD.val[1]));
287
*out01 = vreinterpretq_s16_s64(
288
vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
289
vreinterpret_s64_s32(tmp13.val[0])));
290
*out32 = vreinterpretq_s16_s64(
291
vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
292
vreinterpret_s64_s32(tmp02.val[1])));
293
}
294
295
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
296
const uint8x8_t b) {
297
return vreinterpretq_s16_u16(vsubl_u8(a, b));
298
}
299
300
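// Forward transform of the 4x4 difference block (src - ref); the 16 resulting
// coefficients are stored to 'out' in row-major order.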
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
301
const uint8_t* WEBP_RESTRICT ref,
302
int16_t* WEBP_RESTRICT out) {
303
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
304
{
305
const uint8x16_t S0 = Load4x4_NEON(src);
306
const uint8x16_t R0 = Load4x4_NEON(ref);
307
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
308
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
309
const int16x4_t D0 = vget_low_s16(D0D1);
310
const int16x4_t D1 = vget_high_s16(D0D1);
311
const int16x4_t D2 = vget_low_s16(D2D3);
312
const int16x4_t D3 = vget_high_s16(D2D3);
313
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
314
}
315
{ // 1st pass
316
const int32x4_t kCst937 = vdupq_n_s32(937);
317
const int32x4_t kCst1812 = vdupq_n_s32(1812);
318
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
319
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
320
const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
321
const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
322
vget_high_s16(a0a1_2));
323
const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
324
vget_high_s16(a0a1_2));
325
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
326
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
327
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
328
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
329
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
330
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
331
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
332
}
333
{ // 2nd pass
334
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
335
const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
336
const int32x4_t kCst51000 = vdupq_n_s32(51000);
337
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
338
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
339
const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
340
const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
341
const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
342
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
343
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
344
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
345
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
346
const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
347
const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
348
const int16x4_t a3_eq_0 =
349
vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
350
const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
351
vst1_s16(out + 0, out0);
352
vst1_s16(out + 4, out1);
353
vst1_s16(out + 8, out2);
354
vst1_s16(out + 12, out3);
355
}
356
}
357
358
#else
359
360
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
361
static const int16_t kCoeff16[] = {
362
5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
363
};
364
static const int32_t kCoeff32[] = {
365
1812, 1812, 1812, 1812,
366
937, 937, 937, 937,
367
12000, 12000, 12000, 12000,
368
51000, 51000, 51000, 51000
369
};
370
371
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
372
const uint8_t* WEBP_RESTRICT ref,
373
int16_t* WEBP_RESTRICT out) {
374
const int kBPS = BPS;
375
const uint8_t* src_ptr = src;
376
const uint8_t* ref_ptr = ref;
377
const int16_t* coeff16 = kCoeff16;
378
const int32_t* coeff32 = kCoeff32;
379
380
__asm__ volatile (
381
// load src into q4, q5 in high half
382
"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
383
"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
384
"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
385
"vld1.8 {d11}, [%[src_ptr]] \n"
386
387
// load ref into q6, q7 in high half
388
"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
389
"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
390
"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
391
"vld1.8 {d15}, [%[ref_ptr]] \n"
392
393
// Pack the high values into q4 and q6
394
"vtrn.32 q4, q5 \n"
395
"vtrn.32 q6, q7 \n"
396
397
// d[0-3] = src - ref
398
"vsubl.u8 q0, d8, d12 \n"
399
"vsubl.u8 q1, d9, d13 \n"
400
401
// load coeff16 into q8(d16=5352, d17=2217)
402
"vld1.16 {q8}, [%[coeff16]] \n"
403
404
// load coeff32 high half into q9 = 1812, q10 = 937
405
"vld1.32 {q9, q10}, [%[coeff32]]! \n"
406
407
// load coeff32 low half into q11=12000, q12=51000
408
"vld1.32 {q11,q12}, [%[coeff32]] \n"
409
410
// part 1
411
// Transpose. Register dN is the same as dN in C
412
"vtrn.32 d0, d2 \n"
413
"vtrn.32 d1, d3 \n"
414
"vtrn.16 d0, d1 \n"
415
"vtrn.16 d2, d3 \n"
416
417
"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
418
"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
419
"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
420
"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
421
422
"vadd.s16 d0, d4, d5 \n" // a0 + a1
423
"vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
424
"vsub.s16 d2, d4, d5 \n" // a0 - a1
"vshl.s16 d2, d2, #3 \n" // temp[2+i*4] = (a0-a1) << 3
426
427
"vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
428
"vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
429
"vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
430
"vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
431
432
// temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
433
// temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
434
"vshrn.s32 d1, q9, #9 \n"
435
"vshrn.s32 d3, q10, #9 \n"
436
437
// part 2
438
// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
439
"vtrn.32 d0, d2 \n"
440
"vtrn.32 d1, d3 \n"
441
"vtrn.16 d0, d1 \n"
442
"vtrn.16 d2, d3 \n"
443
444
"vmov.s16 d26, #7 \n"
445
446
"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
447
"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
448
"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
449
"vadd.s16 d4, d4, d26 \n" // a1 + 7
450
"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
451
452
"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
453
"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
454
455
"vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
456
"vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
457
458
"vceq.s16 d4, d7, #0 \n"
459
460
"vshr.s16 d0, d0, #4 \n"
461
"vshr.s16 d2, d2, #4 \n"
462
463
"vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
464
"vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
465
466
"vmvn d4, d4 \n" // !(d1 == 0)
467
// op[4] = (c1*2217 + d1*5352 + 12000)>>16
468
"vshrn.s32 d1, q11, #16 \n"
469
// op[4] += (d1!=0)
470
"vsub.s16 d1, d1, d4 \n"
471
// op[12]= (d1*2217 - c1*5352 + 51000)>>16
472
"vshrn.s32 d3, q12, #16 \n"
473
474
// set result to out array
475
"vst1.16 {q0, q1}, [%[out]] \n"
476
: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
477
[coeff32] "+r"(coeff32) // modified registers
478
: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
479
[out] "r"(out) // constants
480
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
481
"q10", "q11", "q12", "q13" // clobbered
482
);
483
}
484
485
#endif
486
487
#define LOAD_LANE_16b(VALUE, LANE) do { \
488
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
489
src += stride; \
490
} while (0)
491
492
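// Forward Walsh-Hadamard transform of the sixteen DC coefficients, read from
// 'src' with a stride of 16; the final butterfly halves the results before
// they are stored to 'out'.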
static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
493
int16_t* WEBP_RESTRICT out) {
494
const int stride = 16;
495
const int16x4_t zero = vdup_n_s16(0);
496
int32x4x4_t tmp0;
497
int16x4x4_t in;
498
INIT_VECTOR4(in, zero, zero, zero, zero);
499
LOAD_LANE_16b(in.val[0], 0);
500
LOAD_LANE_16b(in.val[1], 0);
501
LOAD_LANE_16b(in.val[2], 0);
502
LOAD_LANE_16b(in.val[3], 0);
503
LOAD_LANE_16b(in.val[0], 1);
504
LOAD_LANE_16b(in.val[1], 1);
505
LOAD_LANE_16b(in.val[2], 1);
506
LOAD_LANE_16b(in.val[3], 1);
507
LOAD_LANE_16b(in.val[0], 2);
508
LOAD_LANE_16b(in.val[1], 2);
509
LOAD_LANE_16b(in.val[2], 2);
510
LOAD_LANE_16b(in.val[3], 2);
511
LOAD_LANE_16b(in.val[0], 3);
512
LOAD_LANE_16b(in.val[1], 3);
513
LOAD_LANE_16b(in.val[2], 3);
514
LOAD_LANE_16b(in.val[3], 3);
515
516
{
517
// a0 = in[0 * 16] + in[2 * 16]
518
// a1 = in[1 * 16] + in[3 * 16]
519
// a2 = in[1 * 16] - in[3 * 16]
520
// a3 = in[0 * 16] - in[2 * 16]
521
const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
522
const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
523
const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
524
const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
525
tmp0.val[0] = vaddq_s32(a0, a1);
526
tmp0.val[1] = vaddq_s32(a3, a2);
527
tmp0.val[2] = vsubq_s32(a3, a2);
528
tmp0.val[3] = vsubq_s32(a0, a1);
529
}
530
{
531
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
532
// a0 = tmp[0 + i] + tmp[ 8 + i]
533
// a1 = tmp[4 + i] + tmp[12 + i]
534
// a2 = tmp[4 + i] - tmp[12 + i]
535
// a3 = tmp[0 + i] - tmp[ 8 + i]
536
const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
537
const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
538
const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
539
const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
540
const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
541
const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
542
const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
543
const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
544
const int16x4_t out0 = vmovn_s32(b0);
545
const int16x4_t out1 = vmovn_s32(b1);
546
const int16x4_t out2 = vmovn_s32(b2);
547
const int16x4_t out3 = vmovn_s32(b3);
548
549
vst1_s16(out + 0, out0);
550
vst1_s16(out + 4, out1);
551
vst1_s16(out + 8, out2);
552
vst1_s16(out + 12, out3);
553
}
554
}
555
#undef LOAD_LANE_16b
556
557
//------------------------------------------------------------------------------
558
// Texture distortion
559
//
560
// We try to match the spectral content (weighted) between source and
561
// reconstructed samples.
562
563
// a 0123, b 0123
564
// a 4567, b 4567
565
// a 89ab, b 89ab
566
// a cdef, b cdef
567
//
568
// transpose
569
//
570
// a 048c, b 048c
571
// a 159d, b 159d
572
// a 26ae, b 26ae
573
// a 37bf, b 37bf
574
//
575
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
576
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
577
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
578
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
579
vreinterpretq_s32_s16(q2_tmp1.val[0]));
580
const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
581
vreinterpretq_s32_s16(q2_tmp1.val[1]));
582
q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
583
q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
584
q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
585
q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
586
return q4_in;
587
}
588
589
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
590
const int16x8x4_t q4_in) {
591
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
592
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
593
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
594
const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
595
const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
596
const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
597
int16x8x4_t q4_out;
598
// tmp[0] = a0 + a1
599
// tmp[1] = a3 + a2
600
// tmp[2] = a3 - a2
601
// tmp[3] = a0 - a1
602
INIT_VECTOR4(q4_out,
603
vabsq_s16(vaddq_s16(q_a0, q_a1)),
604
vabsq_s16(vaddq_s16(q_a3, q_a2)),
605
vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
606
return q4_out;
607
}
608
609
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
610
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
611
q4_in.val[2]));
612
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
613
q4_in.val[3]));
614
const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
615
q4_in.val[3]));
616
const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
617
q4_in.val[2]));
618
int16x8x4_t q4_out;
619
620
INIT_VECTOR4(q4_out,
621
vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
622
vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
623
return q4_out;
624
}
625
626
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
627
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
628
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
629
int16x4x4_t d4_w;
630
INIT_VECTOR4(d4_w,
631
vget_low_s16(vreinterpretq_s16_u16(q_w07)),
632
vget_high_s16(vreinterpretq_s16_u16(q_w07)),
633
vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
634
vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
635
return d4_w;
636
}
637
638
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
639
const int16x4x4_t d4_w) {
640
int32x2_t d_sum;
641
// sum += w[ 0] * abs(b0);
642
// sum += w[ 4] * abs(b1);
643
// sum += w[ 8] * abs(b2);
644
// sum += w[12] * abs(b3);
645
int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
646
int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
647
int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
648
int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
649
q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
650
q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
651
q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
652
q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
653
654
q_sum0 = vaddq_s32(q_sum0, q_sum1);
655
q_sum2 = vaddq_s32(q_sum2, q_sum3);
656
q_sum2 = vaddq_s32(q_sum0, q_sum2);
657
d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
658
d_sum = vpadd_s32(d_sum, d_sum);
659
return d_sum;
660
}
661
662
#define LOAD_LANE_32b(src, VALUE, LANE) \
663
(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
664
665
// Hadamard transform
666
// Returns the weighted sum of the absolute value of transformed coefficients.
667
// w[] contains a row-major 4 by 4 symmetric matrix.
668
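// Illustrative scalar sketch of what the NEON code below computes: with H()
// the 4x4 Hadamard transform,
//   sum(x) = sum_i w[i] * |H(x)[i]|
// and the returned value is |sum(a) - sum(b)| >> 5.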
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
669
const uint8_t* WEBP_RESTRICT const b,
670
const uint16_t* WEBP_RESTRICT const w) {
671
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
672
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
673
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
674
uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
675
uint8x8x4_t d4_in;
676
677
// load data a, b
678
LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
679
LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
680
LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
681
LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
682
LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
683
LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
684
LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
685
LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
686
INIT_VECTOR4(d4_in,
687
vreinterpret_u8_u32(d_in_ab_0123),
688
vreinterpret_u8_u32(d_in_ab_4567),
689
vreinterpret_u8_u32(d_in_ab_89ab),
690
vreinterpret_u8_u32(d_in_ab_cdef));
691
692
{
693
// Vertical pass first to avoid a transpose (vertical and horizontal passes
694
// are commutative because w/kWeightY is symmetric) and subsequent
695
// transpose.
696
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
697
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
698
// horizontal pass
699
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
700
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
701
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
702
703
// abs(sum2 - sum1) >> 5
704
d_sum = vabs_s32(d_sum);
705
d_sum = vshr_n_s32(d_sum, 5);
706
return vget_lane_s32(d_sum, 0);
707
}
708
}
709
#undef LOAD_LANE_32b
710
711
static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
712
const uint8_t* WEBP_RESTRICT const b,
713
const uint16_t* WEBP_RESTRICT const w) {
714
int D = 0;
715
int x, y;
716
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
717
for (x = 0; x < 16; x += 4) {
718
D += Disto4x4_NEON(a + x + y, b + x + y, w);
719
}
720
}
721
return D;
722
}
723
724
//------------------------------------------------------------------------------
725
726
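// Runs the forward transform on each block in [start_block, end_block) and
// bins the coefficients: each |coeff| is scaled down by 3 bits and clipped to
// MAX_COEFF_THRESH before updating the distribution.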
static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
727
const uint8_t* WEBP_RESTRICT pred,
728
int start_block, int end_block,
729
VP8Histogram* WEBP_RESTRICT const histo) {
730
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
731
int j;
732
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
733
for (j = start_block; j < end_block; ++j) {
734
int16_t out[16];
735
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
736
{
737
int k;
738
const int16x8_t a0 = vld1q_s16(out + 0);
739
const int16x8_t b0 = vld1q_s16(out + 8);
740
const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
741
const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
742
const uint16x8_t a2 = vshrq_n_u16(a1, 3);
743
const uint16x8_t b2 = vshrq_n_u16(b1, 3);
744
const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
745
const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
746
vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
747
vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
748
// Convert coefficients to bin.
749
for (k = 0; k < 16; ++k) {
750
++distribution[out[k]];
751
}
752
}
753
}
754
VP8SetHistogramData(distribution, histo);
755
}
756
757
//------------------------------------------------------------------------------
758
759
static WEBP_INLINE void AccumulateSSE16_NEON(
760
const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
761
uint32x4_t* const sum) {
762
const uint8x16_t a0 = vld1q_u8(a);
763
const uint8x16_t b0 = vld1q_u8(b);
764
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
765
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
766
vget_low_u8(abs_diff));
767
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
768
vget_high_u8(abs_diff));
769
/* pair-wise adds and widen */
770
const uint32x4_t sum1 = vpaddlq_u16(prod1);
771
const uint32x4_t sum2 = vpaddlq_u16(prod2);
772
*sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
773
}
774
775
// Horizontal sum of all four uint32_t values in 'sum'.
776
static int SumToInt_NEON(uint32x4_t sum) {
777
#if WEBP_AARCH64
778
return (int)vaddvq_u32(sum);
779
#else
780
const uint64x2_t sum2 = vpaddlq_u32(sum);
781
const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
782
vreinterpret_u32_u64(vget_high_u64(sum2)));
783
return (int)vget_lane_u32(sum3, 0);
784
#endif
785
}
786
787
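// Sum of squared differences between the 16x16 blocks at 'a' and 'b' (rows are
// BPS bytes apart); the 16x8, 8x8 and 4x4 variants below work the same way.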
static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
788
const uint8_t* WEBP_RESTRICT b) {
789
uint32x4_t sum = vdupq_n_u32(0);
790
int y;
791
for (y = 0; y < 16; ++y) {
792
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
793
}
794
return SumToInt_NEON(sum);
795
}
796
797
static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
798
const uint8_t* WEBP_RESTRICT b) {
799
uint32x4_t sum = vdupq_n_u32(0);
800
int y;
801
for (y = 0; y < 8; ++y) {
802
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
803
}
804
return SumToInt_NEON(sum);
805
}
806
807
static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
808
const uint8_t* WEBP_RESTRICT b) {
809
uint32x4_t sum = vdupq_n_u32(0);
810
int y;
811
for (y = 0; y < 8; ++y) {
812
const uint8x8_t a0 = vld1_u8(a + y * BPS);
813
const uint8x8_t b0 = vld1_u8(b + y * BPS);
814
const uint8x8_t abs_diff = vabd_u8(a0, b0);
815
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
816
sum = vpadalq_u16(sum, prod);
817
}
818
return SumToInt_NEON(sum);
819
}
820
821
static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
822
const uint8_t* WEBP_RESTRICT b) {
823
const uint8x16_t a0 = Load4x4_NEON(a);
824
const uint8x16_t b0 = Load4x4_NEON(b);
825
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
826
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
827
vget_low_u8(abs_diff));
828
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
829
vget_high_u8(abs_diff));
830
/* pair-wise adds and widen */
831
const uint32x4_t sum1 = vpaddlq_u16(prod1);
832
const uint32x4_t sum2 = vpaddlq_u16(prod2);
833
return SumToInt_NEON(vaddq_u32(sum1, sum2));
834
}
835
836
//------------------------------------------------------------------------------
837
838
// Compilation with gcc-4.6.x is problematic for now.
839
#if !defined(WORK_AROUND_GCC)
840
841
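// Quantizes 8 coefficients of 'in' starting at 'offset' with the matrix 'mtx':
// the dequantized values are written back to 'in' and the signed quantized
// levels are returned.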
static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
842
const VP8Matrix* WEBP_RESTRICT const mtx,
843
int offset) {
844
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
845
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
846
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
847
const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
848
const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
849
850
const int16x8_t a = vld1q_s16(in + offset); // in
851
const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
852
const int16x8_t sign = vshrq_n_s16(a, 15); // sign
853
const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
854
const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
855
const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
856
const uint32x4_t m2 = vhaddq_u32(m0, bias0);
857
const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
858
const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
859
vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
860
const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
861
const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
862
const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
863
const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
864
vst1q_s16(in + offset, c4);
865
assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
866
return c3;
867
}
868
869
static const uint8_t kShuffles[4][8] = {
870
{ 0, 1, 2, 3, 8, 9, 16, 17 },
871
{ 10, 11, 4, 5, 6, 7, 12, 13 },
872
{ 18, 19, 24, 25, 26, 27, 20, 21 },
873
{ 14, 15, 22, 23, 28, 29, 30, 31 }
874
};
875
876
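// Quantizes a full 4x4 block: levels are stored in zigzag order in 'out' and
// the return value is 1 if any of them is non-zero.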
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
877
const VP8Matrix* WEBP_RESTRICT const mtx) {
878
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
879
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
880
uint8x8x4_t shuffles;
881
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
882
// non-standard versions there.
883
#if defined(__APPLE__) && WEBP_AARCH64 && \
884
defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
885
uint8x16x2_t all_out;
886
INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
887
INIT_VECTOR4(shuffles,
888
vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
889
vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
890
vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
891
vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
892
#else
893
uint8x8x4_t all_out;
894
INIT_VECTOR4(all_out,
895
vreinterpret_u8_s16(vget_low_s16(out0)),
896
vreinterpret_u8_s16(vget_high_s16(out0)),
897
vreinterpret_u8_s16(vget_low_s16(out1)),
898
vreinterpret_u8_s16(vget_high_s16(out1)));
899
INIT_VECTOR4(shuffles,
900
vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
901
vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
902
vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
903
vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
904
#endif
905
// Zigzag reordering
906
vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
907
vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
908
vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
909
vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
910
// test zeros
911
if (*(uint64_t*)(out + 0) != 0) return 1;
912
if (*(uint64_t*)(out + 4) != 0) return 1;
913
if (*(uint64_t*)(out + 8) != 0) return 1;
914
if (*(uint64_t*)(out + 12) != 0) return 1;
915
return 0;
916
}
917
918
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
919
const VP8Matrix* WEBP_RESTRICT const mtx) {
920
int nz;
921
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
922
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
923
return nz;
924
}
925
926
#endif // !WORK_AROUND_GCC
927
928
#if WEBP_AARCH64
929
930
#if BPS == 32
931
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
932
do { \
933
uint8x16_t r; \
934
r = vqtbl2q_u8(qcombined, tbl); \
935
r = vreinterpretq_u8_u32( \
936
vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \
937
vreinterpretq_u32_u8(r), 1)); \
938
vst1q_u8(dst, r); \
939
} while (0)
940
941
#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \
942
do { \
943
uint8x16_t r; \
944
r = vqtbl2q_u8(qcombined, tbl); \
945
vst1q_u8(dst, r); \
946
} while (0)
947
948
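// Computes all ten 4x4 intra predictors for one block; each 4x4 predictor is
// stored at its own I4* offset within 'dst', with rows BPS bytes apart.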
static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
949
const uint8_t* WEBP_RESTRICT top) {
950
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
951
// L K J I X A B C D E F G H
952
// -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
953
static const uint8_t kLookupTbl1[64] = {
954
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12,
955
3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0,
956
4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16,
957
2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31
958
};
959
960
static const uint8_t kLookupTbl2[64] = {
961
20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9,
962
19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
963
18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26,
964
17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
965
};
966
967
static const uint8_t kLookupTbl3[64] = {
968
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19,
969
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18,
970
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17,
971
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16
972
};
973
974
const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
975
const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
976
const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
977
978
const uint8x16_t preload = vld1q_u8(top - 5);
979
uint8x16x2_t qcombined;
980
uint8x16_t result0, result1;
981
982
uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
983
uint8x16_t b = preload;
984
uint8x16_t c = vextq_u8(a, a, 2);
985
986
uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
987
uint8x16_t avg2_all = vrhaddq_u8(a, b);
988
989
uint8x8_t preload_x8, sub_a, sub_c;
990
uint8_t result_u8;
991
uint8x8_t res_lo, res_hi;
992
uint8x16_t full_b;
993
uint16x8_t sub, sum_lo, sum_hi;
994
995
preload_x8 = vget_low_u8(c);
996
preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
997
998
result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
999
1000
avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
1001
avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
1002
1003
qcombined.val[0] = avg2_all;
1004
qcombined.val[1] = avg3_all;
1005
1006
sub_a = vdup_laneq_u8(preload, 4);
1007
1008
// preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
1009
full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
1010
// preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
1011
sub_c = vreinterpret_u8_u32(vdup_n_u32(
1012
vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
1013
1014
sub = vsubl_u8(sub_c, sub_a);
1015
sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
1016
res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
1017
1018
sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
1019
res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
1020
1021
// DC4, VE4, HE4, TM4
1022
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
1023
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
1024
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
1025
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
1026
1027
// RD4, VR4, LD4, VL4
1028
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
1029
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
1030
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
1031
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
1032
1033
// HD4, HU4
1034
result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
1035
result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
1036
1037
vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
1038
vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
1039
vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
1040
vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
1041
}
1042
#endif // BPS == 32
1043
1044
static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
1045
uint8x16_t a = vdupq_n_u8(value);
1046
int i;
1047
for (i = 0; i < 16; i++) {
1048
vst1q_u8(dst + BPS * i, a);
1049
}
1050
}
1051
1052
static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
1053
uint8x16_t a = vld1q_u8(src);
1054
int i;
1055
for (i = 0; i < 16; i++) {
1056
vst1q_u8(dst + BPS * i, a);
1057
}
1058
}
1059
1060
static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
1061
const uint8_t* left) {
1062
uint8x16_t a;
1063
1064
if (left == NULL) {
1065
Fill_NEON(dst, 129);
1066
return;
1067
}
1068
1069
a = vld1q_u8(left + 0);
1070
vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
1071
vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
1072
vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
1073
vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
1074
vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
1075
vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
1076
vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
1077
vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
1078
vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
1079
vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
1080
vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
1081
vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
1082
vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
1083
vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
1084
vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
1085
vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
1086
}
1087
1088
static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
1089
if (top != NULL) {
1090
Fill16_NEON(dst, top);
1091
} else {
1092
Fill_NEON(dst, 127);
1093
}
1094
}
1095
1096
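// 16x16 DC predictor: averages the available top and/or left samples
// (rounded), or falls back to 0x80 when neither is present.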
static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
1097
const uint8_t* top) {
1098
uint8_t s;
1099
1100
if (top != NULL) {
1101
uint16_t dc;
1102
dc = vaddlvq_u8(vld1q_u8(top));
1103
if (left != NULL) {
1104
// top and left present.
1105
dc += vaddlvq_u8(vld1q_u8(left));
1106
s = vqrshrnh_n_u16(dc, 5);
1107
} else {
1108
// top but no left.
1109
s = vqrshrnh_n_u16(dc, 4);
1110
}
1111
} else {
1112
if (left != NULL) {
1113
uint16_t dc;
1114
// left but no top.
1115
dc = vaddlvq_u8(vld1q_u8(left));
1116
s = vqrshrnh_n_u16(dc, 4);
1117
} else {
1118
// No top, no left, nothing.
1119
s = 0x80;
1120
}
1121
}
1122
Fill_NEON(dst, s);
1123
}
1124
1125
static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
1126
const uint8x8_t outer,
1127
const uint8x8x2_t inner,
1128
const uint16x8_t a, int i,
1129
const int n) {
1130
uint8x8_t d1, d2;
1131
uint16x8_t r1, r2;
1132
1133
r1 = vaddl_u8(outer, inner.val[0]);
1134
r1 = vqsubq_u16(r1, a);
1135
d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
1136
r2 = vaddl_u8(outer, inner.val[1]);
1137
r2 = vqsubq_u16(r2, a);
1138
d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
1139
vst1_u8(dst + BPS * (i * 4 + n), d1);
1140
vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
1141
}
1142
1143
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
1144
const uint8_t* top) {
1145
int i;
1146
uint16x8_t a;
1147
uint8x8x2_t inner;
1148
1149
if (left == NULL) {
1150
// True motion without left samples (hence: with default 129 value) is
1151
// equivalent to VE prediction where you just copy the top samples.
1152
// Note that if top samples are not available, the default value is then
1153
// 129, and not 127 as in the VerticalPred case.
1154
if (top != NULL) {
1155
VerticalPred16_NEON(dst, top);
1156
} else {
1157
Fill_NEON(dst, 129);
1158
}
1159
return;
1160
}
1161
1162
// left is not NULL.
1163
if (top == NULL) {
1164
HorizontalPred16_NEON(dst, left);
1165
return;
1166
}
1167
1168
// Neither left nor top are NULL.
1169
a = vdupq_n_u16(left[-1]);
1170
inner = vld1_u8_x2(top);
1171
1172
for (i = 0; i < 4; i++) {
1173
const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
1174
1175
TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
1176
TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
1177
TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
1178
TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
1179
}
1180
}
1181
1182
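// Computes the four 16x16 intra predictors (DC, vertical, horizontal and
// TrueMotion), each written at its own I16* offset inside 'dst'.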
static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
1183
const uint8_t* WEBP_RESTRICT left,
1184
const uint8_t* WEBP_RESTRICT top) {
1185
DCMode_NEON(I16DC16 + dst, left, top);
1186
VerticalPred16_NEON(I16VE16 + dst, top);
1187
HorizontalPred16_NEON(I16HE16 + dst, left);
1188
TrueMotion_NEON(I16TM16 + dst, left, top);
1189
}
1190
1191
#endif // WEBP_AARCH64
1192
1193
//------------------------------------------------------------------------------
1194
// Entry point
1195
1196
extern void VP8EncDspInitNEON(void);
1197
1198
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
1199
VP8ITransform = ITransform_NEON;
1200
VP8FTransform = FTransform_NEON;
1201
1202
VP8FTransformWHT = FTransformWHT_NEON;
1203
1204
VP8TDisto4x4 = Disto4x4_NEON;
1205
VP8TDisto16x16 = Disto16x16_NEON;
1206
VP8CollectHistogram = CollectHistogram_NEON;
1207
1208
VP8SSE16x16 = SSE16x16_NEON;
1209
VP8SSE16x8 = SSE16x8_NEON;
1210
VP8SSE8x8 = SSE8x8_NEON;
1211
VP8SSE4x4 = SSE4x4_NEON;
1212
1213
#if WEBP_AARCH64
1214
#if BPS == 32
1215
VP8EncPredLuma4 = Intra4Preds_NEON;
1216
#endif
1217
VP8EncPredLuma16 = Intra16Preds_NEON;
1218
#endif
1219
1220
#if !defined(WORK_AROUND_GCC)
1221
VP8EncQuantizeBlock = QuantizeBlock_NEON;
1222
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
1223
VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
1224
#endif
1225
}
1226
1227
#else // !WEBP_USE_NEON
1228
1229
WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
1230
1231
#endif // WEBP_USE_NEON
1232
1233