GitHub Repository: godotengine/godot
Path: thirdparty/libwebp/src/dsp/lossless_sse2.c
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal ([email protected])

#include "src/dsp/dsp.h"
15
16
#if defined(WEBP_USE_SSE2)
17
18
#include "src/dsp/common_sse2.h"
19
#include "src/dsp/lossless.h"
20
#include "src/dsp/lossless_common.h"
21
#include <emmintrin.h>
22
23
//------------------------------------------------------------------------------
// Predictor Transform

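// ClampedAddSubtractFull: per 8-bit channel, the prediction is
// clamp_255(c0 + c1 - c2). Widening to 16 bits makes the sum overflow-free,
// and _mm_packus_epi16 performs the clamp for free when narrowing back.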
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  return (uint32_t)_mm_cvtsi128_si32(b);
}

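// ClampedAddSubtractHalf: per channel, clamp_255(avg + (avg - c2) / 2) with
// avg = (c0 + c1) >> 1. The BgtA mask is all-ones where c2 > avg, and
// subtracting it adds 1 before the arithmetic shift, matching the scalar
// division's rounding toward zero for negative values.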
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  return (uint32_t)_mm_cvtsi128_si32(A5);
}

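// Select() returns whichever of 'a' and 'b' is closer to 'c' in L1 distance
// (sum of absolute differences over the four channels). _mm_subs_epu8
// saturates at zero, so OR-ing the two one-sided differences gives |x - y|
// per channel without widening first.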
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128((int)a);
  const __m128i B0 = _mm_cvtsi32_si128((int)b);
  const __m128i C0 = _mm_cvtsi32_si128((int)c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}

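// _mm_avg_epu8 rounds up: it computes (a + b + 1) >> 1 per byte. The identity
// below cancels that bias, since the +1 only matters when the low bits of a
// and b differ, i.e. when (a ^ b) & 1 is set.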
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                             const uint32_t a1,
                                             __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128((int)a0);
  const __m128i A1 = _mm_cvtsi32_si128((int)a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}

static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
  __m128i output;
  Average2_uint32_SSE2(a0, a1, &output);
  return (uint32_t)_mm_cvtsi128_si32(output);
}

static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  return (uint32_t)_mm_cvtsi128_si32(A2);
}

static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  return (uint32_t)_mm_cvtsi128_si32(A0);
}

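// Predictors 5..13 predict a single pixel: 'left' points at the pixel to the
// left of the one being predicted, and 'top' at the pixel directly above it,
// so top[-1] is the top-left and top[1] the top-right neighbor.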
static uint32_t Predictor5_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[-1]);
  return pred;
}
static uint32_t Predictor7_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[0]);
  return pred;
}
static uint32_t Predictor8_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
  return pred;
}
static uint32_t Predictor12_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
  return pred;
}

// Batch versions of those functions.

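// Each PredictorAdd function reconstructs pixels as out[i] = in[i] + pred(i),
// with per-channel addition mod 256 (_mm_add_epi8): four pixels per
// iteration, then a plain-C fallback for the remainder.
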
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
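// out[i] depends on out[i - 1], so the recurrence cannot be vectorized
// directly. Instead, a two-step log-shift prefix sum builds the running
// byte-wise total of the four input pixels inside the register, and the
// previous output (broadcast to all four lanes) is added once at the end.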
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i prev = _mm_set1_epi32((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels,                            \
                                   uint32_t* WEBP_RESTRICT out) {             \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));             \
    const __m128i res = _mm_add_epi8(src, other);                             \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
  }                                                                           \
}

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Because the averages use integer (truncating) arithmetic, each output pixel
// depends on the exact rounded value of its left neighbor, so values cannot
// be accumulated in parallel for predictors 5 to 7.
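// GENERATE_PREDICTOR_ADD (defined in lossless_common.h) wraps a per-pixel
// predictor into a batch PredictorAdd function that loops pixel by pixel.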
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)

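// Predictors 8 and 9 average two pixels from the previous row only, so four
// pixels can be processed at once with Average2_m128i.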
#define GENERATE_PREDICTOR_2(X, IN)                                           \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels,                            \
                                   uint32_t* WEBP_RESTRICT out) {             \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);             \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    __m128i avg, res;                                                         \
    Average2_m128i(&T, &Tother, &avg);                                        \
    res = _mm_add_epi8(avg, src);                                             \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
  }                                                                           \
}
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L, TL), average of (T, TR)).
#define DO_PRED10(OUT) do {                        \
  __m128i avgLTL, avg;                             \
  Average2_m128i(&L, &TL, &avgLTL);                \
  Average2_m128i(&avgTTR, &avgLTL, &avg);          \
  L = _mm_add_epi8(avg, src);                      \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \
} while (0)

#define DO_PRED10_SHIFT do {                                  \
  /* Rotate the pre-computed values for the next iteration.*/ \
  avgTTR = _mm_srli_si128(avgTTR, 4);                         \
  TL = _mm_srli_si128(TL, 4);                                 \
  src = _mm_srli_si128(src, 4);                               \
} while (0)

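// out[i] feeds back into L, so only the top-row average (T, TR) is computed
// for all four pixels at once; each lane is then finished sequentially,
// shifting the registers right by one pixel in between.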
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    DO_PRED10(0);
    DO_PRED10_SHIFT;
    DO_PRED10(1);
    DO_PRED10_SHIFT;
    DO_PRED10(2);
    DO_PRED10_SHIFT;
    DO_PRED10(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT

// Predictor11: select.
#define DO_PRED11(OUT) do {                                                \
  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                           \
  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                         \
  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  /* pb = sum |L-TL| */     \
  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                            \
  const __m128i A = _mm_and_si128(mask, L);                                \
  const __m128i B = _mm_andnot_si128(mask, T);                             \
  const __m128i pred = _mm_or_si128(A, B);  /* pred = (pb > pa) ? L : T */ \
  L = _mm_add_epi8(src, pred);                                             \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                         \
} while (0)

#define DO_PRED11_SHIFT do {                                 \
  /* Shift the pre-computed value for the next iteration.*/  \
  T = _mm_srli_si128(T, 4);                                  \
  TL = _mm_srli_si128(TL, 4);                                \
  src = _mm_srli_si128(src, 4);                              \
  pa = _mm_srli_si128(pa, 4);                                \
} while (0)

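// For each pixel, pa = sum |T - TL| and pb = sum |L - TL| over the four
// channels; the neighbor with the smaller distance to TL is used as the
// prediction. _mm_sad_epu8 pairs each pixel with T in the otherwise-unused
// half so that half contributes zero to the sum.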
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) do {              \
  const __m128i all = _mm_add_epi16(L, (DIFF));      \
  const __m128i alls = _mm_packus_epi16(all, all);   \
  const __m128i res = _mm_add_epi8(src, alls);       \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \
  L = _mm_unpacklo_epi8(res, zero);                  \
} while (0)

#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
  /* Shift the pre-computed value for the next iteration.*/ \
  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
  src = _mm_srli_si128(src, 4);                             \
} while (0)

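// The prediction clamp_255(L + T - TL) is evaluated in 16 bits: T - TL is
// pre-computed for all four pixels (diff_lo/diff_hi), each lane adds the
// current 16-bit L, and _mm_packus_epi16 applies the clamp.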
static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to averages with integers, values cannot be accumulated in parallel for
// predictor 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)

//------------------------------------------------------------------------------
// Subtract-Green Transform

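// Inverse of the subtract-green transform: the green channel is added back
// (mod 256) to red and blue. The shuffles replicate each pixel's green byte
// into the red and blue byte positions so one _mm_add_epi8 handles four
// pixels.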
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);                      // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color Transform

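// Inverse color transform: red and blue are recovered by adding back scaled
// cross-channel deltas, delta = (signed_multiplier * channel) >> 5, as in the
// scalar ColorTransformDelta. With the channel value in the upper byte of
// each 16-bit lane and the multiplier pre-shifted, _mm_mulhi_epi16's implicit
// >> 16 produces exactly that >> 5.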
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);            // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);           // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);   // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);           // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);             // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);           // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

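// 32 pixels per iteration: eight 16-byte loads are de-interleaved into planar
// B/G/R/A registers (VP8L32bToPlanar_SSE2), then re-packed as 24-bit RGB
// (VP8PlanarTo24b_SSE2), giving six 16-byte stores for the 96 output bytes.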
static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  while (num_pixels >= 32) {
    // Load the BGRA buffers.
    __m128i in0 = _mm_loadu_si128(in + 0);
    __m128i in1 = _mm_loadu_si128(in + 1);
    __m128i in2 = _mm_loadu_si128(in + 2);
    __m128i in3 = _mm_loadu_si128(in + 3);
    __m128i in4 = _mm_loadu_si128(in + 4);
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    // At this point, in1/in5 contain red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
    _mm_storeu_si128(out + 3, in6);
    _mm_storeu_si128(out + 4, in3);
    _mm_storeu_si128(out + 5, in7);
    in += 8;
    out += 6;
    num_pixels -= 32;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

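// BGRA to RGBA only needs R and B swapped within each pixel: mask out the
// R/B bytes, swap them with 16-bit shuffles, and OR the untouched G/A bytes
// back in.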
static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

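// The unpack cascade below is a byte transpose: after three rounds, all eight
// values of each channel end up contiguous, so the 4-bit fields of RGBA4444
// can be assembled with shifts and masks on whole registers.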
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
                                       int num_pixels,
                                       uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);           // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);    // -r0|-r1|...|-b6|-a7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);    // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);         // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);       // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

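// RGB565 uses the same byte transpose, then keeps the top 5 bits of red and
// blue and the top 6 of green: red plus the 3 high green bits form one output
// byte, the next 3 green bits plus blue the other.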
static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
                                     int num_pixels,
                                     uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);               // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);            // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);             // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);  // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);  // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

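// BGRA to BGR drops the alpha byte: two pixels per 64-bit half are masked,
// the upper pixel is shifted down one byte so the two 3-byte triplets become
// contiguous, and the 24 bytes of 8 pixels go out as four 8-byte stores.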
static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);  // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);  // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);  // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);  // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);        // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);        // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);         // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);         // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst + 0), c0);
    _mm_storel_epi64((__m128i*)(dst + 6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE2(void);

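// Invoked from the generic VP8LDspInit() dispatcher (see lossless.c) when the
// runtime CPU check reports SSE2 support; it rebinds the shared function
// pointers to the SSE2 implementations above.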
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[5] = Predictor5_SSE2;
  VP8LPredictors[6] = Predictor6_SSE2;
  VP8LPredictors[7] = Predictor7_SSE2;
  VP8LPredictors[8] = Predictor8_SSE2;
  VP8LPredictors[9] = Predictor9_SSE2;
  VP8LPredictors[10] = Predictor10_SSE2;
  VP8LPredictors[11] = Predictor11_SSE2;
  VP8LPredictors[12] = Predictor12_SSE2;
  VP8LPredictors[13] = Predictor13_SSE2;

  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
  VP8LTransformColorInverse = TransformColorInverse_SSE2;

  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)

#endif  // WEBP_USE_SSE2