// Path: blob/master/thirdparty/libwebp/src/dsp/dec_msa.c
// 9913 views
// Copyright 2016 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// MSA version of dsp functions10//11// Author(s): Prashant Patil ([email protected])121314#include "src/dsp/dsp.h"1516#if defined(WEBP_USE_MSA)1718#include "src/dsp/msa_macro.h"1920//------------------------------------------------------------------------------21// Transforms2223#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \24v4i32 a1_m, b1_m, c1_m, d1_m; \25v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \26const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \27const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \28\29a1_m = in0 + in2; \30b1_m = in0 - in2; \31c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \32c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \33c1_m = c_tmp1_m - c_tmp2_m; \34d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \35d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \36d1_m = d_tmp1_m + d_tmp2_m; \37BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \38}3940static void TransformOne(const int16_t* WEBP_RESTRICT in,41uint8_t* WEBP_RESTRICT dst) {42v8i16 input0, input1;43v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;44v4i32 res0, res1, res2, res3;45const v16i8 zero = { 0 };46v16i8 dest0, dest1, dest2, dest3;4748LD_SH2(in, 8, input0, input1);49UNPCK_SH_SW(input0, in0, in1);50UNPCK_SH_SW(input1, in2, in3);51IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);52TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);53IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);54SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);55TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, 
vt3);56LD_SB4(dst, BPS, dest0, dest1, dest2, dest3);57ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,58res0, res1, res2, res3);59ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,60res0, res1, res2, res3);61ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);62CLIP_SW4_0_255(res0, res1, res2, res3);63PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);64res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);65ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);66}6768static void TransformTwo(const int16_t* WEBP_RESTRICT in,69uint8_t* WEBP_RESTRICT dst, int do_two) {70TransformOne(in, dst);71if (do_two) {72TransformOne(in + 16, dst + 4);73}74}7576static void TransformWHT(const int16_t* WEBP_RESTRICT in,77int16_t* WEBP_RESTRICT out) {78v8i16 input0, input1;79const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };80const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };81const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };82const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };83v8i16 tmp0, tmp1, tmp2, tmp3;84v8i16 out0, out1;8586LD_SH2(in, 8, input0, input1);87input1 = SLDI_SH(input1, input1, 8);88tmp0 = input0 + input1;89tmp1 = input0 - input1;90VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);91out0 = tmp2 + tmp3;92out1 = tmp2 - tmp3;93VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);94tmp0 = input0 + input1;95tmp1 = input0 - input1;96VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);97tmp0 = tmp2 + tmp3;98tmp1 = tmp2 - tmp3;99ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1);100SRAI_H2_SH(out0, out1, 3);101out[0] = __msa_copy_s_h(out0, 0);102out[16] = __msa_copy_s_h(out0, 4);103out[32] = __msa_copy_s_h(out1, 0);104out[48] = __msa_copy_s_h(out1, 4);105out[64] = __msa_copy_s_h(out0, 1);106out[80] = __msa_copy_s_h(out0, 5);107out[96] = __msa_copy_s_h(out1, 1);108out[112] = __msa_copy_s_h(out1, 5);109out[128] = __msa_copy_s_h(out0, 2);110out[144] = __msa_copy_s_h(out0, 6);111out[160] = __msa_copy_s_h(out1, 2);112out[176] = 
__msa_copy_s_h(out1, 6);113out[192] = __msa_copy_s_h(out0, 3);114out[208] = __msa_copy_s_h(out0, 7);115out[224] = __msa_copy_s_h(out1, 3);116out[240] = __msa_copy_s_h(out1, 7);117}118119static void TransformDC(const int16_t* WEBP_RESTRICT in,120uint8_t* WEBP_RESTRICT dst) {121const int DC = (in[0] + 4) >> 3;122const v8i16 tmp0 = __msa_fill_h(DC);123ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);124}125126static void TransformAC3(const int16_t* WEBP_RESTRICT in,127uint8_t* WEBP_RESTRICT dst) {128const int a = in[0] + 4;129const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);130const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);131const int in2 = WEBP_TRANSFORM_AC3_MUL2(in[1]);132const int in3 = WEBP_TRANSFORM_AC3_MUL1(in[1]);133v4i32 tmp0 = { 0 };134v4i32 out0 = __msa_fill_w(a + d4);135v4i32 out1 = __msa_fill_w(a + c4);136v4i32 out2 = __msa_fill_w(a - c4);137v4i32 out3 = __msa_fill_w(a - d4);138v4i32 res0, res1, res2, res3;139const v4i32 zero = { 0 };140v16u8 dest0, dest1, dest2, dest3;141142INSERT_W4_SW(in3, in2, -in2, -in3, tmp0);143ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0,144out0, out1, out2, out3);145SRAI_W4_SW(out0, out1, out2, out3, 3);146LD_UB4(dst, BPS, dest0, dest1, dest2, dest3);147ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,148res0, res1, res2, res3);149ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,150res0, res1, res2, res3);151ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3);152CLIP_SW4_0_255(res0, res1, res2, res3);153PCKEV_B2_SW(res0, res1, res2, res3, out0, out1);154res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1);155ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);156}157158//------------------------------------------------------------------------------159// Edge filtering functions160161#define FLIP_SIGN2(in0, in1, out0, out1) { \162out0 = (v16i8)__msa_xori_b(in0, 0x80); \163out1 = (v16i8)__msa_xori_b(in1, 0x80); \164}165166#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) { 
\167FLIP_SIGN2(in0, in1, out0, out1); \168FLIP_SIGN2(in2, in3, out2, out3); \169}170171#define FILT_VAL(q0_m, p0_m, mask, filt) do { \172v16i8 q0_sub_p0; \173q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \174filt = __msa_adds_s_b(filt, q0_sub_p0); \175filt = __msa_adds_s_b(filt, q0_sub_p0); \176filt = __msa_adds_s_b(filt, q0_sub_p0); \177filt = filt & mask; \178} while (0)179180#define FILT2(q_m, p_m, q, p) do { \181u_r = SRAI_H(temp1, 7); \182u_r = __msa_sat_s_h(u_r, 7); \183u_l = SRAI_H(temp3, 7); \184u_l = __msa_sat_s_h(u_l, 7); \185u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \186q_m = __msa_subs_s_b(q_m, u); \187p_m = __msa_adds_s_b(p_m, u); \188q = __msa_xori_b((v16u8)q_m, 0x80); \189p = __msa_xori_b((v16u8)p_m, 0x80); \190} while (0)191192#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do { \193v16i8 p1_m, p0_m, q0_m, q1_m; \194v16i8 filt, t1, t2; \195const v16i8 cnst4b = __msa_ldi_b(4); \196const v16i8 cnst3b = __msa_ldi_b(3); \197\198FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \199filt = __msa_subs_s_b(p1_m, q1_m); \200filt = filt & hev; \201FILT_VAL(q0_m, p0_m, mask, filt); \202t1 = __msa_adds_s_b(filt, cnst4b); \203t1 = SRAI_B(t1, 3); \204t2 = __msa_adds_s_b(filt, cnst3b); \205t2 = SRAI_B(t2, 3); \206q0_m = __msa_subs_s_b(q0_m, t1); \207q0 = __msa_xori_b((v16u8)q0_m, 0x80); \208p0_m = __msa_adds_s_b(p0_m, t2); \209p0 = __msa_xori_b((v16u8)p0_m, 0x80); \210filt = __msa_srari_b(t1, 1); \211hev = __msa_xori_b(hev, 0xff); \212filt = filt & hev; \213q1_m = __msa_subs_s_b(q1_m, filt); \214q1 = __msa_xori_b((v16u8)q1_m, 0x80); \215p1_m = __msa_adds_s_b(p1_m, filt); \216p1 = __msa_xori_b((v16u8)p1_m, 0x80); \217} while (0)218219#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do { \220v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \221v16i8 u, filt, t1, t2, filt_sign; \222v8i16 filt_r, filt_l, u_r, u_l; \223v8i16 temp0, temp1, temp2, temp3; \224const v16i8 cnst4b = __msa_ldi_b(4); \225const v16i8 cnst3b = __msa_ldi_b(3); \226const v8i16 cnst9h = 
__msa_ldi_h(9); \227const v8i16 cnst63h = __msa_ldi_h(63); \228\229FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \230filt = __msa_subs_s_b(p1_m, q1_m); \231FILT_VAL(q0_m, p0_m, mask, filt); \232FLIP_SIGN2(p2, q2, p2_m, q2_m); \233t2 = filt & hev; \234/* filt_val &= ~hev */ \235hev = __msa_xori_b(hev, 0xff); \236filt = filt & hev; \237t1 = __msa_adds_s_b(t2, cnst4b); \238t1 = SRAI_B(t1, 3); \239t2 = __msa_adds_s_b(t2, cnst3b); \240t2 = SRAI_B(t2, 3); \241q0_m = __msa_subs_s_b(q0_m, t1); \242p0_m = __msa_adds_s_b(p0_m, t2); \243filt_sign = __msa_clti_s_b(filt, 0); \244ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \245/* update q2/p2 */ \246temp0 = filt_r * cnst9h; \247temp1 = temp0 + cnst63h; \248temp2 = filt_l * cnst9h; \249temp3 = temp2 + cnst63h; \250FILT2(q2_m, p2_m, q2, p2); \251/* update q1/p1 */ \252temp1 = temp1 + temp0; \253temp3 = temp3 + temp2; \254FILT2(q1_m, p1_m, q1, p1); \255/* update q0/p0 */ \256temp1 = temp1 + temp0; \257temp3 = temp3 + temp2; \258FILT2(q0_m, p0_m, q0, p0); \259} while (0)260261#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \262q0_in, q1_in, q2_in, q3_in, \263limit_in, b_limit_in, thresh_in, \264hev_out, mask_out) do { \265v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \266v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \267v16u8 flat_out; \268\269/* absolute subtraction of pixel values */ \270p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \271p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \272p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \273q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \274q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \275q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \276p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \277p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \278/* calculation of hev */ \279flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \280hev_out = (thresh_in < flat_out); \281/* calculation of mask */ \282p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); 
\283p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1); \284p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \285mask_out = (b_limit_in < p0_asub_q0_m); \286mask_out = __msa_max_u_b(flat_out, mask_out); \287p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \288mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \289q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \290mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \291mask_out = (limit_in < mask_out); \292mask_out = __msa_xori_b(mask_out, 0xff); \293} while (0)294295#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) do { \296const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx); \297const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx); \298SW(tmp0_w, pdst); \299SH(tmp0_h, pdst + stride); \300} while (0)301302#define ST6x4_UB(in0, start_in0_idx, in1, start_in1_idx, pdst, stride) do { \303uint8_t* ptmp1 = (uint8_t*)pdst; \304ST6x1_UB(in0, start_in0_idx, in1, start_in1_idx, ptmp1, 4); \305ptmp1 += stride; \306ST6x1_UB(in0, start_in0_idx + 1, in1, start_in1_idx + 1, ptmp1, 4); \307ptmp1 += stride; \308ST6x1_UB(in0, start_in0_idx + 2, in1, start_in1_idx + 2, ptmp1, 4); \309ptmp1 += stride; \310ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \311} while (0)312313#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do { \314v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \315const v16i8 cnst4b = __msa_ldi_b(4); \316const v16i8 cnst3b = __msa_ldi_b(3); \317\318FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m); \319filt = __msa_subs_s_b(p1_m, q1_m); \320FILT_VAL(q0_m, p0_m, mask, filt); \321filt1 = __msa_adds_s_b(filt, cnst4b); \322filt1 = SRAI_B(filt1, 3); \323filt2 = __msa_adds_s_b(filt, cnst3b); \324filt2 = SRAI_B(filt2, 3); \325q0_m = __msa_subs_s_b(q0_m, filt1); \326p0_m = __msa_adds_s_b(p0_m, filt2); \327q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \328p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \329} while (0)330331#define LPF_SIMPLE_MASK(p1, p0, 
q0, q1, b_limit, mask) do { \332v16u8 p1_a_sub_q1, p0_a_sub_q0; \333\334p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \335p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \336p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \337p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \338mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \339mask = (mask <= b_limit); \340} while (0)341342static void VFilter16(uint8_t* src, int stride,343int b_limit_in, int limit_in, int thresh_in) {344uint8_t* ptemp = src - 4 * stride;345v16u8 p3, p2, p1, p0, q3, q2, q1, q0;346v16u8 mask, hev;347const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);348const v16u8 limit = (v16u8)__msa_fill_b(limit_in);349const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);350351LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);352LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,353hev, mask);354LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);355ptemp = src - 3 * stride;356ST_UB4(p2, p1, p0, q0, ptemp, stride);357ptemp += (4 * stride);358ST_UB2(q1, q2, ptemp, stride);359}360361static void HFilter16(uint8_t* src, int stride,362int b_limit_in, int limit_in, int thresh_in) {363uint8_t* ptmp = src - 4;364v16u8 p3, p2, p1, p0, q3, q2, q1, q0;365v16u8 mask, hev;366v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;367v16u8 row9, row10, row11, row12, row13, row14, row15;368v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;369const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);370const v16u8 limit = (v16u8)__msa_fill_b(limit_in);371const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);372373LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);374ptmp += (8 * stride);375LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);376TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,377row8, row9, row10, row11, row12, row13, row14, row15,378p3, p2, p1, p0, q0, q1, q2, q3);379LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, 
thresh,380hev, mask);381LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);382ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);383ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);384ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);385ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);386ILVRL_B2_SH(q2, q1, tmp2, tmp5);387ptmp = src - 3;388ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);389ptmp += stride;390ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);391ptmp += stride;392ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);393ptmp += stride;394ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);395ptmp += stride;396ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);397ptmp += stride;398ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);399ptmp += stride;400ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);401ptmp += stride;402ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);403ptmp += stride;404ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);405ptmp += stride;406ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);407ptmp += stride;408ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);409ptmp += stride;410ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);411ptmp += stride;412ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);413ptmp += stride;414ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);415ptmp += stride;416ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);417ptmp += stride;418ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);419}420421// on three inner edges422static void VFilterHorEdge16i(uint8_t* src, int stride,423int b_limit, int limit, int thresh) {424v16u8 mask, hev;425v16u8 p3, p2, p1, p0, q3, q2, q1, q0;426const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);427const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);428const v16u8 limit0 = (v16u8)__msa_fill_b(limit);429430LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);431LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,432hev, mask);433LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);434ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);435}436437static void VFilter16i(uint8_t* src_y, int stride,438int b_limit, int limit, int thresh) {439VFilterHorEdge16i(src_y + 4 * stride, stride, b_limit, limit, thresh);440VFilterHorEdge16i(src_y + 8 * 
stride, stride, b_limit, limit, thresh);441VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);442}443444static void HFilterVertEdge16i(uint8_t* src, int stride,445int b_limit, int limit, int thresh) {446v16u8 mask, hev;447v16u8 p3, p2, p1, p0, q3, q2, q1, q0;448v16u8 row0, row1, row2, row3, row4, row5, row6, row7;449v16u8 row8, row9, row10, row11, row12, row13, row14, row15;450v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;451const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);452const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);453const v16u8 limit0 = (v16u8)__msa_fill_b(limit);454455LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);456LD_UB8(src - 4 + (8 * stride), stride,457row8, row9, row10, row11, row12, row13, row14, row15);458TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,459row8, row9, row10, row11, row12, row13, row14, row15,460p3, p2, p1, p0, q0, q1, q2, q3);461LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,462hev, mask);463LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);464ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);465ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);466ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);467ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);468src -= 2;469ST4x8_UB(tmp2, tmp3, src, stride);470src += (8 * stride);471ST4x8_UB(tmp4, tmp5, src, stride);472}473474static void HFilter16i(uint8_t* src_y, int stride,475int b_limit, int limit, int thresh) {476HFilterVertEdge16i(src_y + 4, stride, b_limit, limit, thresh);477HFilterVertEdge16i(src_y + 8, stride, b_limit, limit, thresh);478HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);479}480481// 8-pixels wide variants, for chroma filtering482static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,483int stride, int b_limit_in, int limit_in, int thresh_in) {484uint8_t* ptmp_src_u = src_u - 4 * stride;485uint8_t* ptmp_src_v = src_v - 4 * stride;486uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;487v16u8 p3, p2, p1, p0, 
q3, q2, q1, q0, mask, hev;488v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;489v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;490const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);491const v16u8 limit = (v16u8)__msa_fill_b(limit_in);492const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);493494LD_UB8(ptmp_src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);495LD_UB8(ptmp_src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);496ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);497ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);498LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,499hev, mask);500LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);501p2_d = __msa_copy_s_d((v2i64)p2, 0);502p1_d = __msa_copy_s_d((v2i64)p1, 0);503p0_d = __msa_copy_s_d((v2i64)p0, 0);504q0_d = __msa_copy_s_d((v2i64)q0, 0);505q1_d = __msa_copy_s_d((v2i64)q1, 0);506q2_d = __msa_copy_s_d((v2i64)q2, 0);507ptmp_src_u += stride;508SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_u, stride);509ptmp_src_u += (4 * stride);510SD(q1_d, ptmp_src_u);511ptmp_src_u += stride;512SD(q2_d, ptmp_src_u);513p2_d = __msa_copy_s_d((v2i64)p2, 1);514p1_d = __msa_copy_s_d((v2i64)p1, 1);515p0_d = __msa_copy_s_d((v2i64)p0, 1);516q0_d = __msa_copy_s_d((v2i64)q0, 1);517q1_d = __msa_copy_s_d((v2i64)q1, 1);518q2_d = __msa_copy_s_d((v2i64)q2, 1);519ptmp_src_v += stride;520SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_v, stride);521ptmp_src_v += (4 * stride);522SD(q1_d, ptmp_src_v);523ptmp_src_v += stride;524SD(q2_d, ptmp_src_v);525}526527static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,528int stride, int b_limit_in, int limit_in, int thresh_in) {529uint8_t* ptmp_src_u = src_u - 4;530uint8_t* ptmp_src_v = src_v - 4;531v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;532v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;533v16u8 row9, row10, row11, row12, row13, row14, row15;534v8i16 tmp0, tmp1, tmp2, tmp3, 
tmp4, tmp5, tmp6, tmp7;535const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);536const v16u8 limit = (v16u8)__msa_fill_b(limit_in);537const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);538539LD_UB8(ptmp_src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);540LD_UB8(ptmp_src_v, stride,541row8, row9, row10, row11, row12, row13, row14, row15);542TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,543row8, row9, row10, row11, row12, row13, row14, row15,544p3, p2, p1, p0, q0, q1, q2, q3);545LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,546hev, mask);547LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);548ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);549ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);550ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);551ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);552ILVRL_B2_SH(q2, q1, tmp2, tmp5);553ptmp_src_u += 1;554ST6x4_UB(tmp3, 0, tmp2, 0, ptmp_src_u, stride);555ptmp_src_u += 4 * stride;556ST6x4_UB(tmp4, 0, tmp2, 4, ptmp_src_u, stride);557ptmp_src_v += 1;558ST6x4_UB(tmp6, 0, tmp5, 0, ptmp_src_v, stride);559ptmp_src_v += 4 * stride;560ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);561}562563static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,564uint8_t* WEBP_RESTRICT src_v, int stride,565int b_limit_in, int limit_in, int thresh_in) {566uint64_t p1_d, p0_d, q0_d, q1_d;567v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;568v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;569v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;570const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);571const v16u8 limit = (v16u8)__msa_fill_b(limit_in);572const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);573574LD_UB8(src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);575src_u += (5 * stride);576LD_UB8(src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);577src_v += (5 * stride);578ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);579ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, 
q0, q1, q2, q3);580LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,581hev, mask);582LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);583p1_d = __msa_copy_s_d((v2i64)p1, 0);584p0_d = __msa_copy_s_d((v2i64)p0, 0);585q0_d = __msa_copy_s_d((v2i64)q0, 0);586q1_d = __msa_copy_s_d((v2i64)q1, 0);587SD4(q1_d, q0_d, p0_d, p1_d, src_u, -stride);588p1_d = __msa_copy_s_d((v2i64)p1, 1);589p0_d = __msa_copy_s_d((v2i64)p0, 1);590q0_d = __msa_copy_s_d((v2i64)q0, 1);591q1_d = __msa_copy_s_d((v2i64)q1, 1);592SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);593}594595static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,596uint8_t* WEBP_RESTRICT src_v, int stride,597int b_limit_in, int limit_in, int thresh_in) {598v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;599v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;600v16u8 row9, row10, row11, row12, row13, row14, row15;601v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;602const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);603const v16u8 limit = (v16u8)__msa_fill_b(limit_in);604const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);605606LD_UB8(src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);607LD_UB8(src_v, stride,608row8, row9, row10, row11, row12, row13, row14, row15);609TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,610row8, row9, row10, row11, row12, row13, row14, row15,611p3, p2, p1, p0, q0, q1, q2, q3);612LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,613hev, mask);614LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);615ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);616ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);617ILVL_B2_SW(p0, p1, q1, q0, tmp0, tmp1);618ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);619src_u += 2;620ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, src_u, stride);621src_u += 4 * stride;622ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src_u, stride);623src_v += 2;624ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src_v, stride);625src_v += 4 * stride;626ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);627}628629static void 
SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {630v16u8 p1, p0, q1, q0, mask;631const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);632633LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);634LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);635LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);636ST_UB2(p0, q0, src - stride, stride);637}638639static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {640v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;641v16u8 row8, row9, row10, row11, row12, row13, row14, row15;642v8i16 tmp0, tmp1;643const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);644uint8_t* ptemp_src = src - 2;645646LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);647LD_UB8(ptemp_src + 8 * stride, stride,648row8, row9, row10, row11, row12, row13, row14, row15);649TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,650row8, row9, row10, row11, row12, row13, row14, row15,651p1, p0, q0, q1);652LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);653LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);654ILVRL_B2_SH(q0, p0, tmp1, tmp0);655ptemp_src += 1;656ST2x4_UB(tmp1, 0, ptemp_src, stride);657ptemp_src += 4 * stride;658ST2x4_UB(tmp1, 4, ptemp_src, stride);659ptemp_src += 4 * stride;660ST2x4_UB(tmp0, 0, ptemp_src, stride);661ptemp_src += 4 * stride;662ST2x4_UB(tmp0, 4, ptemp_src, stride);663ptemp_src += 4 * stride;664}665666static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {667SimpleVFilter16(src_y + 4 * stride, stride, b_limit_in);668SimpleVFilter16(src_y + 8 * stride, stride, b_limit_in);669SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in);670}671672static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {673SimpleHFilter16(src_y + 4, stride, b_limit_in);674SimpleHFilter16(src_y + 8, stride, b_limit_in);675SimpleHFilter16(src_y + 12, stride, b_limit_in);676}677678//------------------------------------------------------------------------------679// Intra 
predictions680//------------------------------------------------------------------------------681682// 4x4683684static void DC4(uint8_t* dst) { // DC685uint32_t dc = 4;686int i;687for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];688dc >>= 3;689dc = dc | (dc << 8) | (dc << 16) | (dc << 24);690SW4(dc, dc, dc, dc, dst, BPS);691}692693static void TM4(uint8_t* dst) {694const uint8_t* const ptemp = dst - BPS - 1;695v8i16 T, d, r0, r1, r2, r3;696const v16i8 zero = { 0 };697const v8i16 TL = (v8i16)__msa_fill_h(ptemp[0 * BPS]);698const v8i16 L0 = (v8i16)__msa_fill_h(ptemp[1 * BPS]);699const v8i16 L1 = (v8i16)__msa_fill_h(ptemp[2 * BPS]);700const v8i16 L2 = (v8i16)__msa_fill_h(ptemp[3 * BPS]);701const v8i16 L3 = (v8i16)__msa_fill_h(ptemp[4 * BPS]);702const v16u8 T1 = LD_UB(ptemp + 1);703704T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);705d = T - TL;706ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);707CLIP_SH4_0_255(r0, r1, r2, r3);708PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);709}710711static void VE4(uint8_t* dst) { // vertical712const uint8_t* const ptop = dst - BPS - 1;713const uint32_t val0 = LW(ptop + 0);714const uint32_t val1 = LW(ptop + 4);715uint32_t out;716v16u8 A = { 0 }, B, C, AC, B2, R;717718INSERT_W2_UB(val0, val1, A);719B = SLDI_UB(A, A, 1);720C = SLDI_UB(A, A, 2);721AC = __msa_ave_u_b(A, C);722B2 = __msa_ave_u_b(B, B);723R = __msa_aver_u_b(AC, B2);724out = __msa_copy_s_w((v4i32)R, 0);725SW4(out, out, out, out, dst, BPS);726}727728static void RD4(uint8_t* dst) { // Down-right729const uint8_t* const ptop = dst - 1 - BPS;730uint32_t val0 = LW(ptop + 0);731uint32_t val1 = LW(ptop + 4);732uint32_t val2, val3;733v16u8 A, B, C, AC, B2, R, A1 = { 0 };734735INSERT_W2_UB(val0, val1, A1);736A = SLDI_UB(A1, A1, 12);737A = (v16u8)__msa_insert_b((v16i8)A, 3, ptop[1 * BPS]);738A = (v16u8)__msa_insert_b((v16i8)A, 2, ptop[2 * BPS]);739A = (v16u8)__msa_insert_b((v16i8)A, 1, ptop[3 * BPS]);740A = (v16u8)__msa_insert_b((v16i8)A, 0, ptop[4 * BPS]);741B = SLDI_UB(A, A, 
1);742C = SLDI_UB(A, A, 2);743AC = __msa_ave_u_b(A, C);744B2 = __msa_ave_u_b(B, B);745R = __msa_aver_u_b(AC, B2);746val3 = __msa_copy_s_w((v4i32)R, 0);747R = SLDI_UB(R, R, 1);748val2 = __msa_copy_s_w((v4i32)R, 0);749R = SLDI_UB(R, R, 1);750val1 = __msa_copy_s_w((v4i32)R, 0);751R = SLDI_UB(R, R, 1);752val0 = __msa_copy_s_w((v4i32)R, 0);753SW4(val0, val1, val2, val3, dst, BPS);754}755756static void LD4(uint8_t* dst) { // Down-Left757const uint8_t* const ptop = dst - BPS;758uint32_t val0 = LW(ptop + 0);759uint32_t val1 = LW(ptop + 4);760uint32_t val2, val3;761v16u8 A = { 0 }, B, C, AC, B2, R;762763INSERT_W2_UB(val0, val1, A);764B = SLDI_UB(A, A, 1);765C = SLDI_UB(A, A, 2);766C = (v16u8)__msa_insert_b((v16i8)C, 6, ptop[7]);767AC = __msa_ave_u_b(A, C);768B2 = __msa_ave_u_b(B, B);769R = __msa_aver_u_b(AC, B2);770val0 = __msa_copy_s_w((v4i32)R, 0);771R = SLDI_UB(R, R, 1);772val1 = __msa_copy_s_w((v4i32)R, 0);773R = SLDI_UB(R, R, 1);774val2 = __msa_copy_s_w((v4i32)R, 0);775R = SLDI_UB(R, R, 1);776val3 = __msa_copy_s_w((v4i32)R, 0);777SW4(val0, val1, val2, val3, dst, BPS);778}779780// 16x16781782static void DC16(uint8_t* dst) { // DC783uint32_t dc = 16;784int i;785const v16u8 rtop = LD_UB(dst - BPS);786const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);787v16u8 out;788789for (i = 0; i < 16; ++i) {790dc += dst[-1 + i * BPS];791}792dc += HADD_UH_U32(dctop);793out = (v16u8)__msa_fill_b(dc >> 5);794ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);795ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);796}797798static void TM16(uint8_t* dst) {799int j;800v8i16 d1, d2;801const v16i8 zero = { 0 };802const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);803const v16i8 T = LD_SB(dst - BPS);804805ILVRL_B2_SH(zero, T, d1, d2);806SUB2(d1, TL, d2, TL, d1, d2);807for (j = 0; j < 16; j += 4) {808v16i8 t0, t1, t2, t3;809v8i16 r0, r1, r2, r3, r4, r5, r6, r7;810const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);811const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * 
BPS]);812const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);813const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);814ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);815ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);816CLIP_SH4_0_255(r0, r1, r2, r3);817CLIP_SH4_0_255(r4, r5, r6, r7);818PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);819ST_SB4(t0, t1, t2, t3, dst, BPS);820dst += 4 * BPS;821}822}823824static void VE16(uint8_t* dst) { // vertical825const v16u8 rtop = LD_UB(dst - BPS);826ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);827ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);828}829830static void HE16(uint8_t* dst) { // horizontal831int j;832for (j = 16; j > 0; j -= 4) {833const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);834const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);835const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);836const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);837ST_UB4(L0, L1, L2, L3, dst, BPS);838dst += 4 * BPS;839}840}841842static void DC16NoTop(uint8_t* dst) { // DC with top samples not available843int j;844uint32_t dc = 8;845v16u8 out;846847for (j = 0; j < 16; ++j) {848dc += dst[-1 + j * BPS];849}850out = (v16u8)__msa_fill_b(dc >> 4);851ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);852ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);853}854855static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available856uint32_t dc = 8;857const v16u8 rtop = LD_UB(dst - BPS);858const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);859v16u8 out;860861dc += HADD_UH_U32(dctop);862out = (v16u8)__msa_fill_b(dc >> 4);863ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);864ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);865}866867static void DC16NoTopLeft(uint8_t* dst) { // DC with nothing868const v16u8 out = (v16u8)__msa_fill_b(0x80);869ST_UB8(out, out, out, out, out, out, out, out, dst, 
BPS);870ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);871}872873// Chroma874875#define STORE8x8(out, dst) do { \876SD4(out, out, out, out, dst + 0 * BPS, BPS); \877SD4(out, out, out, out, dst + 4 * BPS, BPS); \878} while (0)879880static void DC8uv(uint8_t* dst) { // DC881uint32_t dc = 8;882int i;883uint64_t out;884const v16u8 rtop = LD_UB(dst - BPS);885const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);886const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);887const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);888v16u8 dctemp;889890for (i = 0; i < 8; ++i) {891dc += dst[-1 + i * BPS];892}893dc += __msa_copy_s_w((v4i32)temp2, 0);894dctemp = (v16u8)__msa_fill_b(dc >> 4);895out = __msa_copy_s_d((v2i64)dctemp, 0);896STORE8x8(out, dst);897}898899static void TM8uv(uint8_t* dst) {900int j;901const v16i8 T1 = LD_SB(dst - BPS);902const v16i8 zero = { 0 };903const v8i16 T = (v8i16)__msa_ilvr_b(zero, T1);904const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);905const v8i16 d = T - TL;906907for (j = 0; j < 8; j += 4) {908v16i8 t0, t1;909v8i16 r0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);910v8i16 r1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);911v8i16 r2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);912v8i16 r3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);913ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);914CLIP_SH4_0_255(r0, r1, r2, r3);915PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);916ST4x4_UB(t0, t1, 0, 2, 0, 2, dst, BPS);917ST4x4_UB(t0, t1, 1, 3, 1, 3, dst + 4, BPS);918dst += 4 * BPS;919}920}921922static void VE8uv(uint8_t* dst) { // vertical923const v16u8 rtop = LD_UB(dst - BPS);924const uint64_t out = __msa_copy_s_d((v2i64)rtop, 0);925STORE8x8(out, dst);926}927928static void HE8uv(uint8_t* dst) { // horizontal929int j;930for (j = 0; j < 8; j += 4) {931const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);932const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);933const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);934const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * 
BPS]);935const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);936const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);937const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);938const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);939SD4(out0, out1, out2, out3, dst, BPS);940dst += 4 * BPS;941}942}943944static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples945const uint32_t dc = 4;946const v16u8 rtop = LD_UB(dst - BPS);947const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);948const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);949const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);950const uint32_t sum_m = __msa_copy_s_w((v4i32)temp2, 0);951const v16u8 dcval = (v16u8)__msa_fill_b((dc + sum_m) >> 3);952const uint64_t out = __msa_copy_s_d((v2i64)dcval, 0);953STORE8x8(out, dst);954}955956static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples957uint32_t dc = 4;958int i;959uint64_t out;960v16u8 dctemp;961962for (i = 0; i < 8; ++i) {963dc += dst[-1 + i * BPS];964}965dctemp = (v16u8)__msa_fill_b(dc >> 3);966out = __msa_copy_s_d((v2i64)dctemp, 0);967STORE8x8(out, dst);968}969970static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing971const uint64_t out = 0x8080808080808080ULL;972STORE8x8(out, dst);973}974975//------------------------------------------------------------------------------976// Entry point977978extern void VP8DspInitMSA(void);979980WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {981VP8TransformWHT = TransformWHT;982VP8Transform = TransformTwo;983VP8TransformDC = TransformDC;984VP8TransformAC3 = TransformAC3;985986VP8VFilter16 = VFilter16;987VP8HFilter16 = HFilter16;988VP8VFilter16i = VFilter16i;989VP8HFilter16i = HFilter16i;990VP8VFilter8 = VFilter8;991VP8HFilter8 = HFilter8;992VP8VFilter8i = VFilter8i;993VP8HFilter8i = HFilter8i;994VP8SimpleVFilter16 = SimpleVFilter16;995VP8SimpleHFilter16 = SimpleHFilter16;996VP8SimpleVFilter16i = SimpleVFilter16i;997VP8SimpleHFilter16i = SimpleHFilter16i;998999VP8PredLuma4[0] = DC4;1000VP8PredLuma4[1] = 
TM4;1001VP8PredLuma4[2] = VE4;1002VP8PredLuma4[4] = RD4;1003VP8PredLuma4[6] = LD4;1004VP8PredLuma16[0] = DC16;1005VP8PredLuma16[1] = TM16;1006VP8PredLuma16[2] = VE16;1007VP8PredLuma16[3] = HE16;1008VP8PredLuma16[4] = DC16NoTop;1009VP8PredLuma16[5] = DC16NoLeft;1010VP8PredLuma16[6] = DC16NoTopLeft;1011VP8PredChroma8[0] = DC8uv;1012VP8PredChroma8[1] = TM8uv;1013VP8PredChroma8[2] = VE8uv;1014VP8PredChroma8[3] = HE8uv;1015VP8PredChroma8[4] = DC8uvNoTop;1016VP8PredChroma8[5] = DC8uvNoLeft;1017VP8PredChroma8[6] = DC8uvNoTopLeft;1018}10191020#else // !WEBP_USE_MSA10211022WEBP_DSP_INIT_STUB(VP8DspInitMSA)10231024#endif // WEBP_USE_MSA102510261027