// Path: blob/master/3rdparty/libwebp/src/dsp/lossless_sse2.c
// (scrape metadata: 16348 views)
// Copyright 2014 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// SSE2 variant of methods for lossless decoder10//11// Author: Skal ([email protected])1213#include "src/dsp/dsp.h"1415#if defined(WEBP_USE_SSE2)1617#include "src/dsp/common_sse2.h"18#include "src/dsp/lossless.h"19#include "src/dsp/lossless_common.h"20#include <assert.h>21#include <emmintrin.h>2223//------------------------------------------------------------------------------24// Predictor Transform2526static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,27uint32_t c1,28uint32_t c2) {29const __m128i zero = _mm_setzero_si128();30const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);31const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);32const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);33const __m128i V1 = _mm_add_epi16(C0, C1);34const __m128i V2 = _mm_sub_epi16(V1, C2);35const __m128i b = _mm_packus_epi16(V2, V2);36const uint32_t output = _mm_cvtsi128_si32(b);37return output;38}3940static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,41uint32_t c1,42uint32_t c2) {43const __m128i zero = _mm_setzero_si128();44const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);45const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);46const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);47const __m128i avg = _mm_add_epi16(C1, C0);48const __m128i A0 = _mm_srli_epi16(avg, 1);49const __m128i A1 = _mm_sub_epi16(A0, B0);50const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);51const __m128i A2 = _mm_sub_epi16(A1, BgtA);52const 
__m128i A3 = _mm_srai_epi16(A2, 1);53const __m128i A4 = _mm_add_epi16(A0, A3);54const __m128i A5 = _mm_packus_epi16(A4, A4);55const uint32_t output = _mm_cvtsi128_si32(A5);56return output;57}5859static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {60int pa_minus_pb;61const __m128i zero = _mm_setzero_si128();62const __m128i A0 = _mm_cvtsi32_si128(a);63const __m128i B0 = _mm_cvtsi32_si128(b);64const __m128i C0 = _mm_cvtsi32_si128(c);65const __m128i AC0 = _mm_subs_epu8(A0, C0);66const __m128i CA0 = _mm_subs_epu8(C0, A0);67const __m128i BC0 = _mm_subs_epu8(B0, C0);68const __m128i CB0 = _mm_subs_epu8(C0, B0);69const __m128i AC = _mm_or_si128(AC0, CA0);70const __m128i BC = _mm_or_si128(BC0, CB0);71const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|72const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|73const __m128i diff = _mm_sub_epi16(pb, pa);74{75int16_t out[8];76_mm_storeu_si128((__m128i*)out, diff);77pa_minus_pb = out[0] + out[1] + out[2] + out[3];78}79return (pa_minus_pb <= 0) ? 
a : b;80}8182static WEBP_INLINE void Average2_m128i(const __m128i* const a0,83const __m128i* const a1,84__m128i* const avg) {85// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)86const __m128i ones = _mm_set1_epi8(1);87const __m128i avg1 = _mm_avg_epu8(*a0, *a1);88const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);89*avg = _mm_sub_epi8(avg1, one);90}9192static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,93const uint32_t a1,94__m128i* const avg) {95// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)96const __m128i ones = _mm_set1_epi8(1);97const __m128i A0 = _mm_cvtsi32_si128(a0);98const __m128i A1 = _mm_cvtsi32_si128(a1);99const __m128i avg1 = _mm_avg_epu8(A0, A1);100const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);101*avg = _mm_sub_epi8(avg1, one);102}103104static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {105const __m128i zero = _mm_setzero_si128();106const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);107const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);108const __m128i sum = _mm_add_epi16(A1, A0);109return _mm_srli_epi16(sum, 1);110}111112static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {113__m128i output;114Average2_uint32_SSE2(a0, a1, &output);115return _mm_cvtsi128_si32(output);116}117118static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,119uint32_t a2) {120const __m128i zero = _mm_setzero_si128();121const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);122const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);123const __m128i sum = _mm_add_epi16(avg1, A1);124const __m128i avg2 = _mm_srli_epi16(sum, 1);125const __m128i A2 = _mm_packus_epi16(avg2, avg2);126const uint32_t output = _mm_cvtsi128_si32(A2);127return output;128}129130static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,131uint32_t a2, uint32_t a3) {132const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);133const __m128i avg2 = 
Average2_uint32_16_SSE2(a2, a3);134const __m128i sum = _mm_add_epi16(avg2, avg1);135const __m128i avg3 = _mm_srli_epi16(sum, 1);136const __m128i A0 = _mm_packus_epi16(avg3, avg3);137const uint32_t output = _mm_cvtsi128_si32(A0);138return output;139}140141static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {142const uint32_t pred = Average3_SSE2(left, top[0], top[1]);143return pred;144}145static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {146const uint32_t pred = Average2_SSE2(left, top[-1]);147return pred;148}149static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {150const uint32_t pred = Average2_SSE2(left, top[0]);151return pred;152}153static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {154const uint32_t pred = Average2_SSE2(top[-1], top[0]);155(void)left;156return pred;157}158static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {159const uint32_t pred = Average2_SSE2(top[0], top[1]);160(void)left;161return pred;162}163static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {164const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);165return pred;166}167static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {168const uint32_t pred = Select_SSE2(top[0], left, top[-1]);169return pred;170}171static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {172const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);173return pred;174}175static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {176const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);177return pred;178}179180// Batch versions of those functions.181182// Predictor0: ARGB_BLACK.183static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,184int num_pixels, uint32_t* out) {185int i;186const __m128i black = _mm_set1_epi32(ARGB_BLACK);187for (i = 0; i + 4 <= 
num_pixels; i += 4) {188const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);189const __m128i res = _mm_add_epi8(src, black);190_mm_storeu_si128((__m128i*)&out[i], res);191}192if (i != num_pixels) {193VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);194}195}196197// Predictor1: left.198static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,199int num_pixels, uint32_t* out) {200int i;201__m128i prev = _mm_set1_epi32(out[-1]);202for (i = 0; i + 4 <= num_pixels; i += 4) {203// a | b | c | d204const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);205// 0 | a | b | c206const __m128i shift0 = _mm_slli_si128(src, 4);207// a | a + b | b + c | c + d208const __m128i sum0 = _mm_add_epi8(src, shift0);209// 0 | 0 | a | a + b210const __m128i shift1 = _mm_slli_si128(sum0, 8);211// a | a + b | a + b + c | a + b + c + d212const __m128i sum1 = _mm_add_epi8(sum0, shift1);213const __m128i res = _mm_add_epi8(sum1, prev);214_mm_storeu_si128((__m128i*)&out[i], res);215// replicate prev output on the four lanes216prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));217}218if (i != num_pixels) {219VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);220}221}222223// Macro that adds 32-bit integers from IN using mod 256 arithmetic224// per 8 bit channel.225#define GENERATE_PREDICTOR_1(X, IN) \226static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \227int num_pixels, uint32_t* out) { \228int i; \229for (i = 0; i + 4 <= num_pixels; i += 4) { \230const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \231const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \232const __m128i res = _mm_add_epi8(src, other); \233_mm_storeu_si128((__m128i*)&out[i], res); \234} \235if (i != num_pixels) { \236VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \237} \238}239240// Predictor2: Top.241GENERATE_PREDICTOR_1(2, upper[i])242// Predictor3: 
Top-right.243GENERATE_PREDICTOR_1(3, upper[i + 1])244// Predictor4: Top-left.245GENERATE_PREDICTOR_1(4, upper[i - 1])246#undef GENERATE_PREDICTOR_1247248// Due to averages with integers, values cannot be accumulated in parallel for249// predictors 5 to 7.250GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)251GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)252GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)253254#define GENERATE_PREDICTOR_2(X, IN) \255static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \256int num_pixels, uint32_t* out) { \257int i; \258for (i = 0; i + 4 <= num_pixels; i += 4) { \259const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \260const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \261const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \262__m128i avg, res; \263Average2_m128i(&T, &Tother, &avg); \264res = _mm_add_epi8(avg, src); \265_mm_storeu_si128((__m128i*)&out[i], res); \266} \267if (i != num_pixels) { \268VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \269} \270}271// Predictor8: average TL T.272GENERATE_PREDICTOR_2(8, upper[i - 1])273// Predictor9: average T TR.274GENERATE_PREDICTOR_2(9, upper[i + 1])275#undef GENERATE_PREDICTOR_2276277// Predictor10: average of (average of (L,TL), average of (T, TR)).278#define DO_PRED10(OUT) do { \279__m128i avgLTL, avg; \280Average2_m128i(&L, &TL, &avgLTL); \281Average2_m128i(&avgTTR, &avgLTL, &avg); \282L = _mm_add_epi8(avg, src); \283out[i + (OUT)] = _mm_cvtsi128_si32(L); \284} while (0)285286#define DO_PRED10_SHIFT do { \287/* Rotate the pre-computed values for the next iteration.*/ \288avgTTR = _mm_srli_si128(avgTTR, 4); \289TL = _mm_srli_si128(TL, 4); \290src = _mm_srli_si128(src, 4); \291} while (0)292293static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,294int num_pixels, uint32_t* out) {295int i;296__m128i L = _mm_cvtsi32_si128(out[-1]);297for (i = 0; i + 4 <= 
num_pixels; i += 4) {298__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);299__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);300const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);301const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);302__m128i avgTTR;303Average2_m128i(&T, &TR, &avgTTR);304DO_PRED10(0);305DO_PRED10_SHIFT;306DO_PRED10(1);307DO_PRED10_SHIFT;308DO_PRED10(2);309DO_PRED10_SHIFT;310DO_PRED10(3);311}312if (i != num_pixels) {313VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);314}315}316#undef DO_PRED10317#undef DO_PRED10_SHIFT318319// Predictor11: select.320#define DO_PRED11(OUT) do { \321const __m128i L_lo = _mm_unpacklo_epi32(L, T); \322const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \323const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \324const __m128i mask = _mm_cmpgt_epi32(pb, pa); \325const __m128i A = _mm_and_si128(mask, L); \326const __m128i B = _mm_andnot_si128(mask, T); \327const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \328L = _mm_add_epi8(src, pred); \329out[i + (OUT)] = _mm_cvtsi128_si32(L); \330} while (0)331332#define DO_PRED11_SHIFT do { \333/* Shift the pre-computed value for the next iteration.*/ \334T = _mm_srli_si128(T, 4); \335TL = _mm_srli_si128(TL, 4); \336src = _mm_srli_si128(src, 4); \337pa = _mm_srli_si128(pa, 4); \338} while (0)339340static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,341int num_pixels, uint32_t* out) {342int i;343__m128i pa;344__m128i L = _mm_cvtsi32_si128(out[-1]);345for (i = 0; i + 4 <= num_pixels; i += 4) {346__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);347__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);348__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);349{350// We can unpack with any value on the upper 32 bits, provided it's the351// same on both operands (so that their sum of abs diff is zero). 
Here we352// use T.353const __m128i T_lo = _mm_unpacklo_epi32(T, T);354const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);355const __m128i T_hi = _mm_unpackhi_epi32(T, T);356const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);357const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);358const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);359pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|360}361DO_PRED11(0);362DO_PRED11_SHIFT;363DO_PRED11(1);364DO_PRED11_SHIFT;365DO_PRED11(2);366DO_PRED11_SHIFT;367DO_PRED11(3);368}369if (i != num_pixels) {370VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);371}372}373#undef DO_PRED11374#undef DO_PRED11_SHIFT375376// Predictor12: ClampedAddSubtractFull.377#define DO_PRED12(DIFF, LANE, OUT) do { \378const __m128i all = _mm_add_epi16(L, (DIFF)); \379const __m128i alls = _mm_packus_epi16(all, all); \380const __m128i res = _mm_add_epi8(src, alls); \381out[i + (OUT)] = _mm_cvtsi128_si32(res); \382L = _mm_unpacklo_epi8(res, zero); \383} while (0)384385#define DO_PRED12_SHIFT(DIFF, LANE) do { \386/* Shift the pre-computed value for the next iteration.*/ \387if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \388src = _mm_srli_si128(src, 4); \389} while (0)390391static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,392int num_pixels, uint32_t* out) {393int i;394const __m128i zero = _mm_setzero_si128();395const __m128i L8 = _mm_cvtsi32_si128(out[-1]);396__m128i L = _mm_unpacklo_epi8(L8, zero);397for (i = 0; i + 4 <= num_pixels; i += 4) {398// Load 4 pixels at a time.399__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);400const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);401const __m128i T_lo = _mm_unpacklo_epi8(T, zero);402const __m128i T_hi = _mm_unpackhi_epi8(T, zero);403const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);404const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);405const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);406__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);407__m128i 
diff_hi = _mm_sub_epi16(T_hi, TL_hi);408DO_PRED12(diff_lo, 0, 0);409DO_PRED12_SHIFT(diff_lo, 0);410DO_PRED12(diff_lo, 1, 1);411DO_PRED12_SHIFT(diff_lo, 1);412DO_PRED12(diff_hi, 0, 2);413DO_PRED12_SHIFT(diff_hi, 0);414DO_PRED12(diff_hi, 1, 3);415}416if (i != num_pixels) {417VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);418}419}420#undef DO_PRED12421#undef DO_PRED12_SHIFT422423// Due to averages with integers, values cannot be accumulated in parallel for424// predictors 13.425GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)426427//------------------------------------------------------------------------------428// Subtract-Green Transform429430static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,431uint32_t* dst) {432int i;433for (i = 0; i + 4 <= num_pixels; i += 4) {434const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb435const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g436const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));437const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g438const __m128i out = _mm_add_epi8(in, C);439_mm_storeu_si128((__m128i*)&dst[i], out);440}441// fallthrough and finish off with plain-C442if (i != num_pixels) {443VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);444}445}446447//------------------------------------------------------------------------------448// Color Transform449450static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,451const uint32_t* const src,452int num_pixels, uint32_t* dst) {453// sign-extended multiplying constants, pre-shifted by 5.454#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend455#define MK_CST_16(HI, LO) \456_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))457const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));458const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);459#undef MK_CST_16460#undef CST461const __m128i 
mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks462int i;463for (i = 0; i + 4 <= num_pixels; i += 4) {464const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb465const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0466const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));467const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0468const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1469const __m128i E = _mm_add_epi8(in, D); // x r' x b'470const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0471const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0472const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0473const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0474const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''475const __m128i out = _mm_or_si128(J, A);476_mm_storeu_si128((__m128i*)&dst[i], out);477}478// Fall-back to C-version for left-overs.479if (i != num_pixels) {480VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);481}482}483484//------------------------------------------------------------------------------485// Color-space conversion functions486487static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,488uint8_t* dst) {489const __m128i* in = (const __m128i*)src;490__m128i* out = (__m128i*)dst;491492while (num_pixels >= 32) {493// Load the BGRA buffers.494__m128i in0 = _mm_loadu_si128(in + 0);495__m128i in1 = _mm_loadu_si128(in + 1);496__m128i in2 = _mm_loadu_si128(in + 2);497__m128i in3 = _mm_loadu_si128(in + 3);498__m128i in4 = _mm_loadu_si128(in + 4);499__m128i in5 = _mm_loadu_si128(in + 5);500__m128i in6 = _mm_loadu_si128(in + 6);501__m128i in7 = _mm_loadu_si128(in + 7);502VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);503VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);504// At this points, in1/in5 contains red only, in2/in6 green only ...505// Pack the colors in 24b RGB.506VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);507_mm_storeu_si128(out + 0, 
in1);508_mm_storeu_si128(out + 1, in5);509_mm_storeu_si128(out + 2, in2);510_mm_storeu_si128(out + 3, in6);511_mm_storeu_si128(out + 4, in3);512_mm_storeu_si128(out + 5, in7);513in += 8;514out += 6;515num_pixels -= 32;516}517// left-overs518if (num_pixels > 0) {519VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);520}521}522523static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,524int num_pixels, uint8_t* dst) {525const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);526const __m128i* in = (const __m128i*)src;527__m128i* out = (__m128i*)dst;528while (num_pixels >= 8) {529const __m128i A1 = _mm_loadu_si128(in++);530const __m128i A2 = _mm_loadu_si128(in++);531const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0532const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0533const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A534const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A535const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));536const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));537const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));538const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));539const __m128i F1 = _mm_or_si128(E1, C1);540const __m128i F2 = _mm_or_si128(E2, C2);541_mm_storeu_si128(out++, F1);542_mm_storeu_si128(out++, F2);543num_pixels -= 8;544}545// left-overs546if (num_pixels > 0) {547VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);548}549}550551static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,552int num_pixels, uint8_t* dst) {553const __m128i mask_0x0f = _mm_set1_epi8(0x0f);554const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);555const __m128i* in = (const __m128i*)src;556__m128i* out = (__m128i*)dst;557while (num_pixels >= 8) {558const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3559const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7560const 
__m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...561const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...562const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...563const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...564const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7565const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7566const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7567const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7568const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7-569const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-a7570const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-571const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7572const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0573#if (WEBP_SWAP_16BIT_CSP == 1)574const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7575#else576const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7577#endif578_mm_storeu_si128(out++, rgba);579num_pixels -= 8;580}581// left-overs582if (num_pixels > 0) {583VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);584}585}586587static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,588int num_pixels, uint8_t* dst) {589const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);590const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);591const __m128i mask_0x07 = _mm_set1_epi8(0x07);592const __m128i* in = (const __m128i*)src;593__m128i* out = (__m128i*)dst;594while (num_pixels >= 8) {595const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3596const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7597const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...598const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...599const __m128i v1l = 
_mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...600const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...601const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7602const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7603const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7604const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7605const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7606const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);607const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b)608const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);609const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b)610const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0611const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx612const __m128i b1 = _mm_srli_epi16(b0, 3);613const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx614#if (WEBP_SWAP_16BIT_CSP == 1)615const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7616#else617const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7618#endif619_mm_storeu_si128(out++, rgba);620num_pixels -= 8;621}622// left-overs623if (num_pixels > 0) {624VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);625}626}627628static void ConvertBGRAToBGR_SSE2(const uint32_t* src,629int num_pixels, uint8_t* dst) {630const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);631const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);632const __m128i* in = (const __m128i*)src;633const uint8_t* const end = dst + num_pixels * 3;634// the last storel_epi64 below writes 8 bytes starting at offset 18635while (dst + 26 <= end) {636const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3637const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7638const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0639const 
__m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0640const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0641const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0642const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00643const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00644const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00645const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00646const __m128i c2 = _mm_srli_si128(c0, 8);647const __m128i c6 = _mm_srli_si128(c4, 8);648_mm_storel_epi64((__m128i*)(dst + 0), c0);649_mm_storel_epi64((__m128i*)(dst + 6), c2);650_mm_storel_epi64((__m128i*)(dst + 12), c4);651_mm_storel_epi64((__m128i*)(dst + 18), c6);652dst += 24;653num_pixels -= 8;654}655// left-overs656if (num_pixels > 0) {657VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);658}659}660661//------------------------------------------------------------------------------662// Entry point663664extern void VP8LDspInitSSE2(void);665666WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {667VP8LPredictors[5] = Predictor5_SSE2;668VP8LPredictors[6] = Predictor6_SSE2;669VP8LPredictors[7] = Predictor7_SSE2;670VP8LPredictors[8] = Predictor8_SSE2;671VP8LPredictors[9] = Predictor9_SSE2;672VP8LPredictors[10] = Predictor10_SSE2;673VP8LPredictors[11] = Predictor11_SSE2;674VP8LPredictors[12] = Predictor12_SSE2;675VP8LPredictors[13] = Predictor13_SSE2;676677VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;678VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;679VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;680VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;681VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;682VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;683VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;684VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;685VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;686VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;687VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;688VP8LPredictorsAdd[11] = 
PredictorAdd11_SSE2;689VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;690VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;691692VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;693VP8LTransformColorInverse = TransformColorInverse_SSE2;694695VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;696VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;697VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;698VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;699VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;700}701702#else // !WEBP_USE_SSE2703704WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)705706#endif // WEBP_USE_SSE2707708709