// Path: blob/master/thirdparty/libwebp/src/dsp/lossless_sse2.c
// Copyright 2014 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// SSE2 variant of methods for lossless decoder10//11// Author: Skal ([email protected])1213#include "src/dsp/dsp.h"1415#if defined(WEBP_USE_SSE2)1617#include "src/dsp/common_sse2.h"18#include "src/dsp/lossless.h"19#include "src/dsp/lossless_common.h"20#include <emmintrin.h>2122//------------------------------------------------------------------------------23// Predictor Transform2425static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,26uint32_t c1,27uint32_t c2) {28const __m128i zero = _mm_setzero_si128();29const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);30const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);31const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);32const __m128i V1 = _mm_add_epi16(C0, C1);33const __m128i V2 = _mm_sub_epi16(V1, C2);34const __m128i b = _mm_packus_epi16(V2, V2);35return (uint32_t)_mm_cvtsi128_si32(b);36}3738static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,39uint32_t c1,40uint32_t c2) {41const __m128i zero = _mm_setzero_si128();42const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);43const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);44const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);45const __m128i avg = _mm_add_epi16(C1, C0);46const __m128i A0 = _mm_srli_epi16(avg, 1);47const __m128i A1 = _mm_sub_epi16(A0, B0);48const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);49const __m128i A2 = _mm_sub_epi16(A1, BgtA);50const __m128i A3 = 
_mm_srai_epi16(A2, 1);51const __m128i A4 = _mm_add_epi16(A0, A3);52const __m128i A5 = _mm_packus_epi16(A4, A4);53return (uint32_t)_mm_cvtsi128_si32(A5);54}5556static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {57int pa_minus_pb;58const __m128i zero = _mm_setzero_si128();59const __m128i A0 = _mm_cvtsi32_si128((int)a);60const __m128i B0 = _mm_cvtsi32_si128((int)b);61const __m128i C0 = _mm_cvtsi32_si128((int)c);62const __m128i AC0 = _mm_subs_epu8(A0, C0);63const __m128i CA0 = _mm_subs_epu8(C0, A0);64const __m128i BC0 = _mm_subs_epu8(B0, C0);65const __m128i CB0 = _mm_subs_epu8(C0, B0);66const __m128i AC = _mm_or_si128(AC0, CA0);67const __m128i BC = _mm_or_si128(BC0, CB0);68const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|69const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|70const __m128i diff = _mm_sub_epi16(pb, pa);71{72int16_t out[8];73_mm_storeu_si128((__m128i*)out, diff);74pa_minus_pb = out[0] + out[1] + out[2] + out[3];75}76return (pa_minus_pb <= 0) ? 
a : b;77}7879static WEBP_INLINE void Average2_m128i(const __m128i* const a0,80const __m128i* const a1,81__m128i* const avg) {82// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)83const __m128i ones = _mm_set1_epi8(1);84const __m128i avg1 = _mm_avg_epu8(*a0, *a1);85const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);86*avg = _mm_sub_epi8(avg1, one);87}8889static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,90const uint32_t a1,91__m128i* const avg) {92// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)93const __m128i ones = _mm_set1_epi8(1);94const __m128i A0 = _mm_cvtsi32_si128((int)a0);95const __m128i A1 = _mm_cvtsi32_si128((int)a1);96const __m128i avg1 = _mm_avg_epu8(A0, A1);97const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);98*avg = _mm_sub_epi8(avg1, one);99}100101static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {102const __m128i zero = _mm_setzero_si128();103const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);104const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);105const __m128i sum = _mm_add_epi16(A1, A0);106return _mm_srli_epi16(sum, 1);107}108109static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {110__m128i output;111Average2_uint32_SSE2(a0, a1, &output);112return (uint32_t)_mm_cvtsi128_si32(output);113}114115static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,116uint32_t a2) {117const __m128i zero = _mm_setzero_si128();118const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);119const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);120const __m128i sum = _mm_add_epi16(avg1, A1);121const __m128i avg2 = _mm_srli_epi16(sum, 1);122const __m128i A2 = _mm_packus_epi16(avg2, avg2);123return (uint32_t)_mm_cvtsi128_si32(A2);124}125126static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,127uint32_t a2, uint32_t a3) {128const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);129const __m128i avg2 = 
Average2_uint32_16_SSE2(a2, a3);130const __m128i sum = _mm_add_epi16(avg2, avg1);131const __m128i avg3 = _mm_srli_epi16(sum, 1);132const __m128i A0 = _mm_packus_epi16(avg3, avg3);133return (uint32_t)_mm_cvtsi128_si32(A0);134}135136static uint32_t Predictor5_SSE2(const uint32_t* const left,137const uint32_t* const top) {138const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);139return pred;140}141static uint32_t Predictor6_SSE2(const uint32_t* const left,142const uint32_t* const top) {143const uint32_t pred = Average2_SSE2(*left, top[-1]);144return pred;145}146static uint32_t Predictor7_SSE2(const uint32_t* const left,147const uint32_t* const top) {148const uint32_t pred = Average2_SSE2(*left, top[0]);149return pred;150}151static uint32_t Predictor8_SSE2(const uint32_t* const left,152const uint32_t* const top) {153const uint32_t pred = Average2_SSE2(top[-1], top[0]);154(void)left;155return pred;156}157static uint32_t Predictor9_SSE2(const uint32_t* const left,158const uint32_t* const top) {159const uint32_t pred = Average2_SSE2(top[0], top[1]);160(void)left;161return pred;162}163static uint32_t Predictor10_SSE2(const uint32_t* const left,164const uint32_t* const top) {165const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);166return pred;167}168static uint32_t Predictor11_SSE2(const uint32_t* const left,169const uint32_t* const top) {170const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);171return pred;172}173static uint32_t Predictor12_SSE2(const uint32_t* const left,174const uint32_t* const top) {175const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);176return pred;177}178static uint32_t Predictor13_SSE2(const uint32_t* const left,179const uint32_t* const top) {180const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);181return pred;182}183184// Batch versions of those functions.185186// Predictor0: ARGB_BLACK.187static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,188int 
num_pixels, uint32_t* WEBP_RESTRICT out) {189int i;190const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);191for (i = 0; i + 4 <= num_pixels; i += 4) {192const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);193const __m128i res = _mm_add_epi8(src, black);194_mm_storeu_si128((__m128i*)&out[i], res);195}196if (i != num_pixels) {197VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);198}199(void)upper;200}201202// Predictor1: left.203static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,204int num_pixels, uint32_t* WEBP_RESTRICT out) {205int i;206__m128i prev = _mm_set1_epi32((int)out[-1]);207for (i = 0; i + 4 <= num_pixels; i += 4) {208// a | b | c | d209const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);210// 0 | a | b | c211const __m128i shift0 = _mm_slli_si128(src, 4);212// a | a + b | b + c | c + d213const __m128i sum0 = _mm_add_epi8(src, shift0);214// 0 | 0 | a | a + b215const __m128i shift1 = _mm_slli_si128(sum0, 8);216// a | a + b | a + b + c | a + b + c + d217const __m128i sum1 = _mm_add_epi8(sum0, shift1);218const __m128i res = _mm_add_epi8(sum1, prev);219_mm_storeu_si128((__m128i*)&out[i], res);220// replicate prev output on the four lanes221prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));222}223if (i != num_pixels) {224VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);225}226}227228// Macro that adds 32-bit integers from IN using mod 256 arithmetic229// per 8 bit channel.230#define GENERATE_PREDICTOR_1(X, IN) \231static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \232int num_pixels, \233uint32_t* WEBP_RESTRICT out) { \234int i; \235for (i = 0; i + 4 <= num_pixels; i += 4) { \236const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \237const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \238const __m128i res = _mm_add_epi8(src, other); \239_mm_storeu_si128((__m128i*)&out[i], res); \240} \241if (i != num_pixels) { 
\242VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \243} \244}245246// Predictor2: Top.247GENERATE_PREDICTOR_1(2, upper[i])248// Predictor3: Top-right.249GENERATE_PREDICTOR_1(3, upper[i + 1])250// Predictor4: Top-left.251GENERATE_PREDICTOR_1(4, upper[i - 1])252#undef GENERATE_PREDICTOR_1253254// Due to averages with integers, values cannot be accumulated in parallel for255// predictors 5 to 7.256GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)257GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)258GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)259260#define GENERATE_PREDICTOR_2(X, IN) \261static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \262int num_pixels, \263uint32_t* WEBP_RESTRICT out) { \264int i; \265for (i = 0; i + 4 <= num_pixels; i += 4) { \266const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \267const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \268const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \269__m128i avg, res; \270Average2_m128i(&T, &Tother, &avg); \271res = _mm_add_epi8(avg, src); \272_mm_storeu_si128((__m128i*)&out[i], res); \273} \274if (i != num_pixels) { \275VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \276} \277}278// Predictor8: average TL T.279GENERATE_PREDICTOR_2(8, upper[i - 1])280// Predictor9: average T TR.281GENERATE_PREDICTOR_2(9, upper[i + 1])282#undef GENERATE_PREDICTOR_2283284// Predictor10: average of (average of (L,TL), average of (T, TR)).285#define DO_PRED10(OUT) do { \286__m128i avgLTL, avg; \287Average2_m128i(&L, &TL, &avgLTL); \288Average2_m128i(&avgTTR, &avgLTL, &avg); \289L = _mm_add_epi8(avg, src); \290out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \291} while (0)292293#define DO_PRED10_SHIFT do { \294/* Rotate the pre-computed values for the next iteration.*/ \295avgTTR = _mm_srli_si128(avgTTR, 4); \296TL = _mm_srli_si128(TL, 4); \297src = _mm_srli_si128(src, 4); \298} while 
(0)299300static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,301int num_pixels, uint32_t* WEBP_RESTRICT out) {302int i;303__m128i L = _mm_cvtsi32_si128((int)out[-1]);304for (i = 0; i + 4 <= num_pixels; i += 4) {305__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);306__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);307const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);308const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);309__m128i avgTTR;310Average2_m128i(&T, &TR, &avgTTR);311DO_PRED10(0);312DO_PRED10_SHIFT;313DO_PRED10(1);314DO_PRED10_SHIFT;315DO_PRED10(2);316DO_PRED10_SHIFT;317DO_PRED10(3);318}319if (i != num_pixels) {320VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);321}322}323#undef DO_PRED10324#undef DO_PRED10_SHIFT325326// Predictor11: select.327#define DO_PRED11(OUT) do { \328const __m128i L_lo = _mm_unpacklo_epi32(L, T); \329const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \330const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \331const __m128i mask = _mm_cmpgt_epi32(pb, pa); \332const __m128i A = _mm_and_si128(mask, L); \333const __m128i B = _mm_andnot_si128(mask, T); \334const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? 
L : T*/ \335L = _mm_add_epi8(src, pred); \336out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \337} while (0)338339#define DO_PRED11_SHIFT do { \340/* Shift the pre-computed value for the next iteration.*/ \341T = _mm_srli_si128(T, 4); \342TL = _mm_srli_si128(TL, 4); \343src = _mm_srli_si128(src, 4); \344pa = _mm_srli_si128(pa, 4); \345} while (0)346347static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,348int num_pixels, uint32_t* WEBP_RESTRICT out) {349int i;350__m128i pa;351__m128i L = _mm_cvtsi32_si128((int)out[-1]);352for (i = 0; i + 4 <= num_pixels; i += 4) {353__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);354__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);355__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);356{357// We can unpack with any value on the upper 32 bits, provided it's the358// same on both operands (so that their sum of abs diff is zero). Here we359// use T.360const __m128i T_lo = _mm_unpacklo_epi32(T, T);361const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);362const __m128i T_hi = _mm_unpackhi_epi32(T, T);363const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);364const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);365const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);366pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|367}368DO_PRED11(0);369DO_PRED11_SHIFT;370DO_PRED11(1);371DO_PRED11_SHIFT;372DO_PRED11(2);373DO_PRED11_SHIFT;374DO_PRED11(3);375}376if (i != num_pixels) {377VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);378}379}380#undef DO_PRED11381#undef DO_PRED11_SHIFT382383// Predictor12: ClampedAddSubtractFull.384#define DO_PRED12(DIFF, LANE, OUT) do { \385const __m128i all = _mm_add_epi16(L, (DIFF)); \386const __m128i alls = _mm_packus_epi16(all, all); \387const __m128i res = _mm_add_epi8(src, alls); \388out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \389L = _mm_unpacklo_epi8(res, zero); \390} while (0)391392#define DO_PRED12_SHIFT(DIFF, LANE) do { \393/* Shift the pre-computed value 
for the next iteration.*/ \394if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \395src = _mm_srli_si128(src, 4); \396} while (0)397398static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,399int num_pixels, uint32_t* WEBP_RESTRICT out) {400int i;401const __m128i zero = _mm_setzero_si128();402const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);403__m128i L = _mm_unpacklo_epi8(L8, zero);404for (i = 0; i + 4 <= num_pixels; i += 4) {405// Load 4 pixels at a time.406__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);407const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);408const __m128i T_lo = _mm_unpacklo_epi8(T, zero);409const __m128i T_hi = _mm_unpackhi_epi8(T, zero);410const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);411const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);412const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);413__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);414__m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);415DO_PRED12(diff_lo, 0, 0);416DO_PRED12_SHIFT(diff_lo, 0);417DO_PRED12(diff_lo, 1, 1);418DO_PRED12_SHIFT(diff_lo, 1);419DO_PRED12(diff_hi, 0, 2);420DO_PRED12_SHIFT(diff_hi, 0);421DO_PRED12(diff_hi, 1, 3);422}423if (i != num_pixels) {424VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);425}426}427#undef DO_PRED12428#undef DO_PRED12_SHIFT429430// Due to averages with integers, values cannot be accumulated in parallel for431// predictors 13.432GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)433434//------------------------------------------------------------------------------435// Subtract-Green Transform436437static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,438uint32_t* dst) {439int i;440for (i = 0; i + 4 <= num_pixels; i += 4) {441const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb442const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g443const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));444const __m128i C = 
_mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g445const __m128i out = _mm_add_epi8(in, C);446_mm_storeu_si128((__m128i*)&dst[i], out);447}448// fallthrough and finish off with plain-C449if (i != num_pixels) {450VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);451}452}453454//------------------------------------------------------------------------------455// Color Transform456457static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,458const uint32_t* const src,459int num_pixels, uint32_t* dst) {460// sign-extended multiplying constants, pre-shifted by 5.461#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend462#define MK_CST_16(HI, LO) \463_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))464const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));465const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);466#undef MK_CST_16467#undef CST468const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks469int i;470for (i = 0; i + 4 <= num_pixels; i += 4) {471const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb472const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0473const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));474const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0475const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1476const __m128i E = _mm_add_epi8(in, D); // x r' x b'477const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0478const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0479const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0480const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0481const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''482const __m128i out = _mm_or_si128(J, A);483_mm_storeu_si128((__m128i*)&dst[i], out);484}485// Fall-back to C-version for left-overs.486if (i != num_pixels) {487VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + 
i);488}489}490491//------------------------------------------------------------------------------492// Color-space conversion functions493494static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,495int num_pixels, uint8_t* WEBP_RESTRICT dst) {496const __m128i* in = (const __m128i*)src;497__m128i* out = (__m128i*)dst;498499while (num_pixels >= 32) {500// Load the BGRA buffers.501__m128i in0 = _mm_loadu_si128(in + 0);502__m128i in1 = _mm_loadu_si128(in + 1);503__m128i in2 = _mm_loadu_si128(in + 2);504__m128i in3 = _mm_loadu_si128(in + 3);505__m128i in4 = _mm_loadu_si128(in + 4);506__m128i in5 = _mm_loadu_si128(in + 5);507__m128i in6 = _mm_loadu_si128(in + 6);508__m128i in7 = _mm_loadu_si128(in + 7);509VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);510VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);511// At this points, in1/in5 contains red only, in2/in6 green only ...512// Pack the colors in 24b RGB.513VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);514_mm_storeu_si128(out + 0, in1);515_mm_storeu_si128(out + 1, in5);516_mm_storeu_si128(out + 2, in2);517_mm_storeu_si128(out + 3, in6);518_mm_storeu_si128(out + 4, in3);519_mm_storeu_si128(out + 5, in7);520in += 8;521out += 6;522num_pixels -= 32;523}524// left-overs525if (num_pixels > 0) {526VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);527}528}529530static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,531int num_pixels, uint8_t* WEBP_RESTRICT dst) {532const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);533const __m128i* in = (const __m128i*)src;534__m128i* out = (__m128i*)dst;535while (num_pixels >= 8) {536const __m128i A1 = _mm_loadu_si128(in++);537const __m128i A2 = _mm_loadu_si128(in++);538const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0539const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0540const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A541const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 
A542const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));543const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));544const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));545const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));546const __m128i F1 = _mm_or_si128(E1, C1);547const __m128i F2 = _mm_or_si128(E2, C2);548_mm_storeu_si128(out++, F1);549_mm_storeu_si128(out++, F2);550num_pixels -= 8;551}552// left-overs553if (num_pixels > 0) {554VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);555}556}557558static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,559int num_pixels,560uint8_t* WEBP_RESTRICT dst) {561const __m128i mask_0x0f = _mm_set1_epi8(0x0f);562const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);563const __m128i* in = (const __m128i*)src;564__m128i* out = (__m128i*)dst;565while (num_pixels >= 8) {566const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3567const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7568const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...569const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...570const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...571const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...572const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7573const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7574const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7575const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7576const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7-577const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-a7578const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-579const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7580const __m128i rgba1 = _mm_srli_si128(rgba0, 8); 
// ba0..ba7 | 0581#if (WEBP_SWAP_16BIT_CSP == 1)582const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7583#else584const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7585#endif586_mm_storeu_si128(out++, rgba);587num_pixels -= 8;588}589// left-overs590if (num_pixels > 0) {591VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);592}593}594595static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,596int num_pixels,597uint8_t* WEBP_RESTRICT dst) {598const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);599const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);600const __m128i mask_0x07 = _mm_set1_epi8(0x07);601const __m128i* in = (const __m128i*)src;602__m128i* out = (__m128i*)dst;603while (num_pixels >= 8) {604const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3605const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7606const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...607const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...608const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...609const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...610const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7611const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7612const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7613const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7614const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7615const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);616const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b)617const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);618const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b)619const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0620const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx621const __m128i b1 = 
_mm_srli_epi16(b0, 3);622const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx623#if (WEBP_SWAP_16BIT_CSP == 1)624const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7625#else626const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7627#endif628_mm_storeu_si128(out++, rgba);629num_pixels -= 8;630}631// left-overs632if (num_pixels > 0) {633VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);634}635}636637static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,638int num_pixels, uint8_t* WEBP_RESTRICT dst) {639const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);640const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);641const __m128i* in = (const __m128i*)src;642const uint8_t* const end = dst + num_pixels * 3;643// the last storel_epi64 below writes 8 bytes starting at offset 18644while (dst + 26 <= end) {645const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3646const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7647const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0648const __m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0649const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0650const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0651const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00652const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00653const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00654const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00655const __m128i c2 = _mm_srli_si128(c0, 8);656const __m128i c6 = _mm_srli_si128(c4, 8);657_mm_storel_epi64((__m128i*)(dst + 0), c0);658_mm_storel_epi64((__m128i*)(dst + 6), c2);659_mm_storel_epi64((__m128i*)(dst + 12), c4);660_mm_storel_epi64((__m128i*)(dst + 18), c6);661dst += 24;662num_pixels -= 8;663}664// left-overs665if (num_pixels > 0) {666VP8LConvertBGRAToBGR_C((const uint32_t*)in, 
num_pixels, dst);667}668}669670//------------------------------------------------------------------------------671// Entry point672673extern void VP8LDspInitSSE2(void);674675WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {676VP8LPredictors[5] = Predictor5_SSE2;677VP8LPredictors[6] = Predictor6_SSE2;678VP8LPredictors[7] = Predictor7_SSE2;679VP8LPredictors[8] = Predictor8_SSE2;680VP8LPredictors[9] = Predictor9_SSE2;681VP8LPredictors[10] = Predictor10_SSE2;682VP8LPredictors[11] = Predictor11_SSE2;683VP8LPredictors[12] = Predictor12_SSE2;684VP8LPredictors[13] = Predictor13_SSE2;685686VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;687VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;688VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;689VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;690VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;691VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;692VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;693VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;694VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;695VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;696VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;697VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;698VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;699VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;700701VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;702VP8LTransformColorInverse = TransformColorInverse_SSE2;703704VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;705VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;706VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;707VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;708VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;709}710711#else // !WEBP_USE_SSE2712713WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)714715#endif // WEBP_USE_SSE2716717718