Path: thirdparty/libwebp/src/dsp/lossless_sse2.c
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal ([email protected])

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)

#include <emmintrin.h>
#include <string.h>

#include "src/dsp/common_sse2.h"
#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"

//------------------------------------------------------------------------------
// Predictor Transform

static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  return (uint32_t)_mm_cvtsi128_si32(b);
}

static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  return (uint32_t)_mm_cvtsi128_si32(A5);
}

static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128((int)a);
  const __m128i B0 = _mm_cvtsi32_si128((int)b);
  const __m128i C0 = _mm_cvtsi32_si128((int)c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
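
// For reference, the selection rule above corresponds to the scalar sketch
// below (illustration only, not compiled; the canonical scalar version lives
// elsewhere in the library):
#if 0
static uint32_t SelectScalarSketch(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb = 0;  // accumulates sum(|b - c|) - sum(|a - c|)
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const int ac = (int)((a >> shift) & 0xff) - (int)((c >> shift) & 0xff);
    const int bc = (int)((b >> shift) & 0xff) - (int)((c >> shift) & 0xff);
    pa_minus_pb += (bc < 0 ? -bc : bc) - (ac < 0 ? -ac : ac);
  }
  return (pa_minus_pb <= 0) ? a : b;  // pick 'a' when it is at least as close
}
#endif  // sketch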

static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                             const uint32_t a1,
                                             __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128((int)a0);
  const __m128i A1 = _mm_cvtsi32_si128((int)a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}

static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
  __m128i output;
  Average2_uint32_SSE2(a0, a1, &output);
  return (uint32_t)_mm_cvtsi128_si32(output);
}

static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  return (uint32_t)_mm_cvtsi128_si32(A2);
}

static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  return (uint32_t)_mm_cvtsi128_si32(A0);
}
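
// Note on the averaging identity used above:
//   (a + b) >> 1 == ((a + b + 1) >> 1) - ((a ^ b) & 1)
// _mm_avg_epu8 computes the rounding-up average (a + b + 1) >> 1, while the
// predictors need the rounding-down one. The correction bit (a ^ b) & 1 is 1
// exactly when a + b is odd, i.e. exactly when _mm_avg_epu8 rounded up.
// Worked example: a = 5, b = 8: _mm_avg_epu8 gives (5 + 8 + 1) >> 1 = 7,
// (5 ^ 8) & 1 = 1, and 7 - 1 = 6 == (5 + 8) >> 1.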

static uint32_t Predictor5_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[-1]);
  return pred;
}
static uint32_t Predictor7_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[0]);
  return pred;
}
static uint32_t Predictor8_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
  return pred;
}
static uint32_t Predictor12_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
  return pred;
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i prev = _mm_set1_epi32((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                          \
  static void PredictorAdd##X##_SSE2(const uint32_t* in,                     \
                                     const uint32_t* upper, int num_pixels,  \
                                     uint32_t* WEBP_RESTRICT out) {          \
    int i;                                                                   \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                               \
      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);           \
      const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));          \
      const __m128i res = _mm_add_epi8(src, other);                          \
      _mm_storeu_si128((__m128i*)&out[i], res);                              \
    }                                                                        \
    if (i != num_pixels) {                                                   \
      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);  \
    }                                                                        \
  }

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
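
// GENERATE_PREDICTOR_ADD is defined in lossless_common.h and builds a batch
// 'add' function from a per-pixel predictor. Roughly (a sketch for readers of
// this file, not the verbatim macro), the generated function behaves like:
#if 0
static void PredictorAddSketch(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out,
                               uint32_t (*predictor)(const uint32_t* left,
                                                     const uint32_t* top)) {
  int x;
  for (x = 0; x < num_pixels; ++x) {
    const uint32_t pred = predictor(&out[x - 1], upper + x);
    out[x] = VP8LAddPixels(in[x], pred);  // per-channel addition, mod 256
  }
}
#endif  // sketch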

#define GENERATE_PREDICTOR_2(X, IN)                                          \
  static void PredictorAdd##X##_SSE2(const uint32_t* in,                     \
                                     const uint32_t* upper, int num_pixels,  \
                                     uint32_t* WEBP_RESTRICT out) {          \
    int i;                                                                   \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                               \
      const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));         \
      const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);          \
      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);           \
      __m128i avg, res;                                                      \
      Average2_m128i(&T, &Tother, &avg);                                     \
      res = _mm_add_epi8(avg, src);                                          \
      _mm_storeu_si128((__m128i*)&out[i], res);                              \
    }                                                                        \
    if (i != num_pixels) {                                                   \
      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);  \
    }                                                                        \
  }
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(OUT) do {                          \
  __m128i avgLTL, avg;                               \
  Average2_m128i(&L, &TL, &avgLTL);                  \
  Average2_m128i(&avgTTR, &avgLTL, &avg);            \
  L = _mm_add_epi8(avg, src);                        \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);   \
} while (0)

#define DO_PRED10_SHIFT do {                                   \
  /* Rotate the pre-computed values for the next iteration.*/  \
  avgTTR = _mm_srli_si128(avgTTR, 4);                          \
  TL = _mm_srli_si128(TL, 4);                                  \
  src = _mm_srli_si128(src, 4);                                \
} while (0)

static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    DO_PRED10(0);
    DO_PRED10_SHIFT;
    DO_PRED10(1);
    DO_PRED10_SHIFT;
    DO_PRED10(2);
    DO_PRED10_SHIFT;
    DO_PRED10(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT
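
// Note on the DO_PRED10/DO_PRED10_SHIFT pattern above: predictor 10 needs the
// just-decoded left pixel L, so the four pixels of a vector cannot all be
// produced in one shot. The top-row work (avgTTR) is vectorized four pixels
// at a time; each lane is then finished serially, with _mm_srli_si128
// rotating the next pixel's data into lane 0. Predictors 11 and 12 below use
// the same structure.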

// Predictor11: select.
#define DO_PRED11(OUT) do {                                                \
  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                           \
  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                         \
  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  /* pb = sum |L-TL|*/      \
  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                            \
  const __m128i A = _mm_and_si128(mask, L);                                \
  const __m128i B = _mm_andnot_si128(mask, T);                             \
  const __m128i pred = _mm_or_si128(A, B);  /* pred = (pa > pb)? L : T*/   \
  L = _mm_add_epi8(src, pred);                                             \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                         \
} while (0)

#define DO_PRED11_SHIFT do {                                  \
  /* Shift the pre-computed value for the next iteration.*/   \
  T = _mm_srli_si128(T, 4);                                   \
  TL = _mm_srli_si128(TL, 4);                                 \
  src = _mm_srli_si128(src, 4);                               \
  pa = _mm_srli_si128(pa, 4);                                 \
} while (0)

static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here
      // we use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) do {                \
  const __m128i all = _mm_add_epi16(L, (DIFF));        \
  const __m128i alls = _mm_packus_epi16(all, all);     \
  const __m128i res = _mm_add_epi8(src, alls);         \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res);   \
  L = _mm_unpacklo_epi8(res, zero);                    \
} while (0)

#define DO_PRED12_SHIFT(DIFF, LANE) do {                      \
  /* Shift the pre-computed value for the next iteration.*/   \
  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
  src = _mm_srli_si128(src, 4);                               \
} while (0)

static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to averages with integers, values cannot be accumulated in parallel for
// predictor 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)

//------------------------------------------------------------------------------
// Subtract-Green Transform

static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);                      // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}
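
// Per pixel, the inverse subtract-green transform above is equivalent to the
// scalar sketch below (illustration only, not compiled; pixel layout is
// 0xAARRGGBB):
#if 0
static uint32_t AddGreenSketch(uint32_t argb) {
  const uint32_t green = (argb >> 8) & 0xff;
  uint32_t red_blue = argb & 0x00ff00ffu;
  red_blue += (green << 16) | green;  // add green to red and blue at once
  red_blue &= 0x00ff00ffu;            // keep each channel mod 256
  return (argb & 0xff00ff00u) | red_blue;
}
#endif  // sketch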

//------------------------------------------------------------------------------
// Color Transform

static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red), CST(green_to_blue));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);                 // a 0 g 0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
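
// Note on the CST constants above: each color-transform code is a signed
// 8-bit value t, and the per-channel delta in the lossless format is
// (t * channel) >> 5. The channel value sits in the high byte of its 16-bit
// lane (i.e. channel << 8), and CST pre-shifts the sign-extended code to
// (int8_t)t << 3, so that _mm_mulhi_epi16, which returns (x * y) >> 16,
// produces ((channel << 8) * ((int8_t)t << 3)) >> 16 == (t * channel) >> 5.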

//------------------------------------------------------------------------------
// Color-space conversion functions

static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  while (num_pixels >= 32) {
    // Load the BGRA buffers.
    __m128i in0 = _mm_loadu_si128(in + 0);
    __m128i in1 = _mm_loadu_si128(in + 1);
    __m128i in2 = _mm_loadu_si128(in + 2);
    __m128i in3 = _mm_loadu_si128(in + 3);
    __m128i in4 = _mm_loadu_si128(in + 4);
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    // At this point, in1/in5 contains red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
    _mm_storeu_si128(out + 3, in6);
    _mm_storeu_si128(out + 4, in3);
    _mm_storeu_si128(out + 5, in7);
    in += 8;
    out += 6;
    num_pixels -= 32;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
                                       int num_pixels,
                                       uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
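
// Note on the unpack ladder above: the three unpacklo/unpackhi stages act as
// a bytewise transpose of the eight BGRA pixels, regrouping them into planar
// runs (b0..b7 | g0..g7 and r0..r7 | a0..a7). Once planar, each output
// channel can be masked and shifted for all eight pixels at once; the same
// ladder is reused by the RGB565 conversion below.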

static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
                                     int num_pixels,
                                     uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);       // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);       // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);  // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);  // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);  // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);  // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);        // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);        // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);         // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);         // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst + 0), c0);
    _mm_storel_epi64((__m128i*)(dst + 6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}
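
// Note on the loop bound above: each iteration emits 8 pixels, i.e. 24 bytes
// of packed 3-byte output, but the last _mm_storel_epi64 writes 8 bytes
// starting at offset 18 and thus touches bytes [18, 26). Hence the
// 'dst + 26 <= end' guard rather than 'dst + 24'; the two spare bytes are
// rewritten by the next iteration or left to the plain-C tail handler.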

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[5] = Predictor5_SSE2;
  VP8LPredictors[6] = Predictor6_SSE2;
  VP8LPredictors[7] = Predictor7_SSE2;
  VP8LPredictors[8] = Predictor8_SSE2;
  VP8LPredictors[9] = Predictor9_SSE2;
  VP8LPredictors[10] = Predictor10_SSE2;
  VP8LPredictors[11] = Predictor11_SSE2;
  VP8LPredictors[12] = Predictor12_SSE2;
  VP8LPredictors[13] = Predictor13_SSE2;

  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
  VP8LTransformColorInverse = TransformColorInverse_SSE2;

  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;

  // SSE exports for AVX and above.
  memcpy(VP8LPredictorsAdd_SSE, VP8LPredictorsAdd, sizeof(VP8LPredictorsAdd));

  VP8LAddGreenToBlueAndRed_SSE = AddGreenToBlueAndRed_SSE2;
  VP8LTransformColorInverse_SSE = TransformColorInverse_SSE2;

  VP8LConvertBGRAToRGB_SSE = ConvertBGRAToRGB_SSE2;
  VP8LConvertBGRAToRGBA_SSE = ConvertBGRAToRGBA_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)

#endif  // WEBP_USE_SSE2