// File: 3rdparty/libwebp/src/dsp/enc_msa.c
// Copyright 2016 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// MSA version of encoder dsp functions.10//11// Author: Prashant Patil ([email protected])1213#include "src/dsp/dsp.h"1415#if defined(WEBP_USE_MSA)1617#include <stdlib.h>18#include "src/dsp/msa_macro.h"19#include "src/enc/vp8i_enc.h"2021//------------------------------------------------------------------------------22// Transforms2324#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \25v4i32 a1_m, b1_m, c1_m, d1_m; \26const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \27const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \28v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \29v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \30v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \31v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \32\33ADDSUB2(in0, in2, a1_m, b1_m); \34SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \35c_tmp2_m = c_tmp2_m + in3; \36c1_m = c_tmp1_m - c_tmp2_m; \37SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \38d_tmp1_m = d_tmp1_m + in1; \39d1_m = d_tmp1_m + d_tmp2_m; \40BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \41} while (0)4243static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,44uint8_t* dst) {45v8i16 input0, input1;46v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;47v4i32 res0, res1, res2, res3;48v16i8 dest0, dest1, dest2, dest3;49const v16i8 zero = { 0 };5051LD_SH2(in, 8, input0, input1);52UNPCK_SH_SW(input0, in0, in1);53UNPCK_SH_SW(input1, in2, in3);54IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);55TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);56IDCT_1D_W(hz0, 
hz1, hz2, hz3, vt0, vt1, vt2, vt3);57SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);58TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);59LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);60ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,61res0, res1, res2, res3);62ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,63res0, res1, res2, res3);64ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);65CLIP_SW4_0_255(res0, res1, res2, res3);66PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);67res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);68ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);69}7071static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,72int do_two) {73ITransformOne(ref, in, dst);74if (do_two) {75ITransformOne(ref + 4, in + 16, dst + 4);76}77}7879static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,80int16_t* out) {81uint64_t out0, out1, out2, out3;82uint32_t in0, in1, in2, in3;83v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;84v8i16 t0, t1, t2, t3;85v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };86const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };87const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };88const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };89const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };90const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };91const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };9293LW4(src, BPS, in0, in1, in2, in3);94INSERT_W4_UB(in0, in1, in2, in3, src0);95LW4(ref, BPS, in0, in1, in2, in3);96INSERT_W4_UB(in0, in1, in2, in3, src1);97ILVRL_B2_UB(src0, src1, srcl0, srcl1);98HSUB_UB2_SH(srcl0, srcl1, t0, t1);99VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);100ADDSUB2(t2, t3, t0, t1);101t0 = SRLI_H(t0, 3);102VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);103tmp0 = __msa_hadd_s_w(t3, t3);104tmp2 = __msa_hsub_s_w(t3, t3);105FILL_W2_SW(1812, 937, tmp1, tmp3);106DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);107SRAI_W2_SW(tmp1, tmp3, 
9);108PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);109VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);110ADDSUB2(t2, t3, t0, t1);111VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);112tmp0 = __msa_hadd_s_w(t3, t3);113tmp2 = __msa_hsub_s_w(t3, t3);114ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);115SRAI_W2_SW(tmp0, tmp2, 4);116FILL_W2_SW(12000, 51000, tmp1, tmp3);117DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);118SRAI_W2_SW(tmp1, tmp3, 16);119UNPCK_R_SH_SW(t1, tmp4);120tmp5 = __msa_ceqi_w(tmp4, 0);121tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);122tmp5 = __msa_fill_w(1);123tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);124tmp1 += tmp5;125PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);126out0 = __msa_copy_s_d((v2i64)t0, 0);127out1 = __msa_copy_s_d((v2i64)t0, 1);128out2 = __msa_copy_s_d((v2i64)t1, 0);129out3 = __msa_copy_s_d((v2i64)t1, 1);130SD4(out0, out1, out2, out3, out, 8);131}132133static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {134v8i16 in0 = { 0 };135v8i16 in1 = { 0 };136v8i16 tmp0, tmp1, tmp2, tmp3;137v8i16 out0, out1;138const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };139const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };140const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };141const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };142143in0 = __msa_insert_h(in0, 0, in[ 0]);144in0 = __msa_insert_h(in0, 1, in[ 64]);145in0 = __msa_insert_h(in0, 2, in[128]);146in0 = __msa_insert_h(in0, 3, in[192]);147in0 = __msa_insert_h(in0, 4, in[ 16]);148in0 = __msa_insert_h(in0, 5, in[ 80]);149in0 = __msa_insert_h(in0, 6, in[144]);150in0 = __msa_insert_h(in0, 7, in[208]);151in1 = __msa_insert_h(in1, 0, in[ 48]);152in1 = __msa_insert_h(in1, 1, in[112]);153in1 = __msa_insert_h(in1, 2, in[176]);154in1 = __msa_insert_h(in1, 3, in[240]);155in1 = __msa_insert_h(in1, 4, in[ 32]);156in1 = __msa_insert_h(in1, 5, in[ 96]);157in1 = __msa_insert_h(in1, 6, in[160]);158in1 = __msa_insert_h(in1, 7, in[224]);159ADDSUB2(in0, in1, tmp0, tmp1);160VSHF_H2_SH(tmp0, tmp1, tmp0, 
tmp1, mask0, mask1, tmp2, tmp3);161ADDSUB2(tmp2, tmp3, tmp0, tmp1);162VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);163ADDSUB2(in0, in1, tmp0, tmp1);164VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);165ADDSUB2(tmp2, tmp3, out0, out1);166SRAI_H2_SH(out0, out1, 1);167ST_SH2(out0, out1, out, 8);168}169170static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {171int sum;172uint32_t in0_m, in1_m, in2_m, in3_m;173v16i8 src0 = { 0 };174v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;175v4i32 dst0, dst1;176const v16i8 zero = { 0 };177const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };178const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };179const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };180const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };181182LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);183INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);184ILVRL_B2_SH(zero, src0, tmp0, tmp1);185VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);186ADDSUB2(in0, in1, tmp0, tmp1);187VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);188ADDSUB2(tmp2, tmp3, tmp0, tmp1);189VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);190ADDSUB2(in0, in1, tmp0, tmp1);191VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);192ADDSUB2(tmp2, tmp3, tmp0, tmp1);193tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);194tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);195LD_SH2(w, 8, tmp2, tmp3);196DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);197dst0 = dst0 + dst1;198sum = HADD_SW_S32(dst0);199return sum;200}201202static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,203const uint16_t* const w) {204const int sum1 = TTransform_MSA(a, w);205const int sum2 = TTransform_MSA(b, w);206return abs(sum2 - sum1) >> 5;207}208209static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,210const uint16_t* const w) {211int D = 0;212int x, y;213for (y = 0; y < 16 * BPS; y += 4 * BPS) {214for (x = 0; x < 16; x += 4) {215D += Disto4x4_MSA(a + x + y, b + x + 
y, w);216}217}218return D;219}220221//------------------------------------------------------------------------------222// Histogram223224static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,225int start_block, int end_block,226VP8Histogram* const histo) {227int j;228int distribution[MAX_COEFF_THRESH + 1] = { 0 };229for (j = start_block; j < end_block; ++j) {230int16_t out[16];231VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);232{233int k;234v8i16 coeff0, coeff1;235const v8i16 zero = { 0 };236const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);237LD_SH2(&out[0], 8, coeff0, coeff1);238coeff0 = __msa_add_a_h(coeff0, zero);239coeff1 = __msa_add_a_h(coeff1, zero);240SRAI_H2_SH(coeff0, coeff1, 3);241coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);242coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);243ST_SH2(coeff0, coeff1, &out[0], 8);244for (k = 0; k < 16; ++k) {245++distribution[out[k]];246}247}248}249VP8SetHistogramData(distribution, histo);250}251252//------------------------------------------------------------------------------253// Intra predictions254255// luma 4x4 prediction256257#define DST(x, y) dst[(x) + (y) * BPS]258#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)259#define AVG2(a, b) (((a) + (b) + 1) >> 1)260261static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical262const v16u8 A1 = { 0 };263const uint64_t val_m = LD(top - 1);264const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);265const v16u8 B = SLDI_UB(A, A, 1);266const v16u8 C = SLDI_UB(A, A, 2);267const v16u8 AC = __msa_ave_u_b(A, C);268const v16u8 B2 = __msa_ave_u_b(B, B);269const v16u8 R = __msa_aver_u_b(AC, B2);270const uint32_t out = __msa_copy_s_w((v4i32)R, 0);271SW4(out, out, out, out, dst, BPS);272}273274static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal275const int X = top[-1];276const int I = top[-2];277const int J = top[-3];278const int K = top[-4];279const int L = top[-5];280WebPUint32ToMem(dst 
+ 0 * BPS, 0x01010101U * AVG3(X, I, J));281WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));282WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));283WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));284}285286static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {287uint32_t dc = 4;288int i;289for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];290dc >>= 3;291dc = dc | (dc << 8) | (dc << 16) | (dc << 24);292SW4(dc, dc, dc, dc, dst, BPS);293}294295static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {296const v16u8 A2 = { 0 };297const uint64_t val_m = LD(top - 5);298const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);299const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);300const v16u8 B = SLDI_UB(A, A, 1);301const v16u8 C = SLDI_UB(A, A, 2);302const v16u8 AC = __msa_ave_u_b(A, C);303const v16u8 B2 = __msa_ave_u_b(B, B);304const v16u8 R0 = __msa_aver_u_b(AC, B2);305const v16u8 R1 = SLDI_UB(R0, R0, 1);306const v16u8 R2 = SLDI_UB(R1, R1, 1);307const v16u8 R3 = SLDI_UB(R2, R2, 1);308const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);309const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);310const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);311const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);312SW4(val3, val2, val1, val0, dst, BPS);313}314315static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {316const v16u8 A1 = { 0 };317const uint64_t val_m = LD(top);318const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);319const v16u8 B = SLDI_UB(A, A, 1);320const v16u8 C1 = SLDI_UB(A, A, 2);321const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);322const v16u8 AC = __msa_ave_u_b(A, C);323const v16u8 B2 = __msa_ave_u_b(B, B);324const v16u8 R0 = __msa_aver_u_b(AC, B2);325const v16u8 R1 = SLDI_UB(R0, R0, 1);326const v16u8 R2 = SLDI_UB(R1, R1, 1);327const v16u8 R3 = SLDI_UB(R2, R2, 1);328const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);329const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);330const 
uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);331const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);332SW4(val0, val1, val2, val3, dst, BPS);333}334335static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {336const int X = top[-1];337const int I = top[-2];338const int J = top[-3];339const int K = top[-4];340const int A = top[0];341const int B = top[1];342const int C = top[2];343const int D = top[3];344DST(0, 0) = DST(1, 2) = AVG2(X, A);345DST(1, 0) = DST(2, 2) = AVG2(A, B);346DST(2, 0) = DST(3, 2) = AVG2(B, C);347DST(3, 0) = AVG2(C, D);348DST(0, 3) = AVG3(K, J, I);349DST(0, 2) = AVG3(J, I, X);350DST(0, 1) = DST(1, 3) = AVG3(I, X, A);351DST(1, 1) = DST(2, 3) = AVG3(X, A, B);352DST(2, 1) = DST(3, 3) = AVG3(A, B, C);353DST(3, 1) = AVG3(B, C, D);354}355356static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {357const int A = top[0];358const int B = top[1];359const int C = top[2];360const int D = top[3];361const int E = top[4];362const int F = top[5];363const int G = top[6];364const int H = top[7];365DST(0, 0) = AVG2(A, B);366DST(1, 0) = DST(0, 2) = AVG2(B, C);367DST(2, 0) = DST(1, 2) = AVG2(C, D);368DST(3, 0) = DST(2, 2) = AVG2(D, E);369DST(0, 1) = AVG3(A, B, C);370DST(1, 1) = DST(0, 3) = AVG3(B, C, D);371DST(2, 1) = DST(1, 3) = AVG3(C, D, E);372DST(3, 1) = DST(2, 3) = AVG3(D, E, F);373DST(3, 2) = AVG3(E, F, G);374DST(3, 3) = AVG3(F, G, H);375}376377static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {378const int I = top[-2];379const int J = top[-3];380const int K = top[-4];381const int L = top[-5];382DST(0, 0) = AVG2(I, J);383DST(2, 0) = DST(0, 1) = AVG2(J, K);384DST(2, 1) = DST(0, 2) = AVG2(K, L);385DST(1, 0) = AVG3(I, J, K);386DST(3, 0) = DST(1, 1) = AVG3(J, K, L);387DST(3, 1) = DST(1, 2) = AVG3(K, L, L);388DST(3, 2) = DST(2, 2) =389DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;390}391392static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {393const int X = top[-1];394const int I = top[-2];395const int J = top[-3];396const int 
K = top[-4];397const int L = top[-5];398const int A = top[0];399const int B = top[1];400const int C = top[2];401DST(0, 0) = DST(2, 1) = AVG2(I, X);402DST(0, 1) = DST(2, 2) = AVG2(J, I);403DST(0, 2) = DST(2, 3) = AVG2(K, J);404DST(0, 3) = AVG2(L, K);405DST(3, 0) = AVG3(A, B, C);406DST(2, 0) = AVG3(X, A, B);407DST(1, 0) = DST(3, 1) = AVG3(I, X, A);408DST(1, 1) = DST(3, 2) = AVG3(J, I, X);409DST(1, 2) = DST(3, 3) = AVG3(K, J, I);410DST(1, 3) = AVG3(L, K, J);411}412413static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {414const v16i8 zero = { 0 };415const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);416const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);417const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);418const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);419const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);420const v16u8 T1 = LD_UB(top);421const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);422const v8i16 d = T - TL;423v8i16 r0, r1, r2, r3;424ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);425CLIP_SH4_0_255(r0, r1, r2, r3);426PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);427}428429#undef DST430#undef AVG3431#undef AVG2432433static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {434DC4(I4DC4 + dst, top);435TM4(I4TM4 + dst, top);436VE4(I4VE4 + dst, top);437HE4(I4HE4 + dst, top);438RD4(I4RD4 + dst, top);439VR4(I4VR4 + dst, top);440LD4(I4LD4 + dst, top);441VL4(I4VL4 + dst, top);442HD4(I4HD4 + dst, top);443HU4(I4HU4 + dst, top);444}445446// luma 16x16 prediction447448#define STORE16x16(out, dst) do { \449ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \450ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \451} while (0)452453static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {454if (top != NULL) {455const v16u8 out = LD_UB(top);456STORE16x16(out, dst);457} else {458const v16u8 out = (v16u8)__msa_fill_b(0x7f);459STORE16x16(out, dst);460}461}462463static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,464const 
uint8_t* left) {465if (left != NULL) {466int j;467for (j = 0; j < 16; j += 4) {468const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);469const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);470const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);471const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);472ST_UB4(L0, L1, L2, L3, dst, BPS);473dst += 4 * BPS;474left += 4;475}476} else {477const v16u8 out = (v16u8)__msa_fill_b(0x81);478STORE16x16(out, dst);479}480}481482static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,483const uint8_t* top) {484if (left != NULL) {485if (top != NULL) {486int j;487v8i16 d1, d2;488const v16i8 zero = { 0 };489const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);490const v16u8 T = LD_UB(top);491ILVRL_B2_SH(zero, T, d1, d2);492SUB2(d1, TL, d2, TL, d1, d2);493for (j = 0; j < 16; j += 4) {494v16i8 t0, t1, t2, t3;495v8i16 r0, r1, r2, r3, r4, r5, r6, r7;496const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);497const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);498const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);499const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);500ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);501ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);502CLIP_SH4_0_255(r0, r1, r2, r3);503CLIP_SH4_0_255(r4, r5, r6, r7);504PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);505ST_SB4(t0, t1, t2, t3, dst, BPS);506dst += 4 * BPS;507}508} else {509HorizontalPred16x16(dst, left);510}511} else {512if (top != NULL) {513VerticalPred16x16(dst, top);514} else {515const v16u8 out = (v16u8)__msa_fill_b(0x81);516STORE16x16(out, dst);517}518}519}520521static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,522const uint8_t* top) {523int DC;524v16u8 out;525if (top != NULL && left != NULL) {526const v16u8 rtop = LD_UB(top);527const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);528const v16u8 rleft = LD_UB(left);529const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);530const v8u16 dctemp = dctop + dcleft;531DC = HADD_UH_U32(dctemp);532DC 
= (DC + 16) >> 5;533} else if (left != NULL) { // left but no top534const v16u8 rleft = LD_UB(left);535const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);536DC = HADD_UH_U32(dcleft);537DC = (DC + DC + 16) >> 5;538} else if (top != NULL) { // top but no left539const v16u8 rtop = LD_UB(top);540const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);541DC = HADD_UH_U32(dctop);542DC = (DC + DC + 16) >> 5;543} else { // no top, no left, nothing.544DC = 0x80;545}546out = (v16u8)__msa_fill_b(DC);547STORE16x16(out, dst);548}549550static void Intra16Preds_MSA(uint8_t* dst,551const uint8_t* left, const uint8_t* top) {552DCMode16x16(I16DC16 + dst, left, top);553VerticalPred16x16(I16VE16 + dst, top);554HorizontalPred16x16(I16HE16 + dst, left);555TrueMotion16x16(I16TM16 + dst, left, top);556}557558// Chroma 8x8 prediction559560#define CALC_DC8(in, out) do { \561const v8u16 temp0 = __msa_hadd_u_h(in, in); \562const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \563const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \564const v2i64 temp3 = __msa_splati_d(temp2, 1); \565const v2i64 temp4 = temp3 + temp2; \566const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \567const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \568out = __msa_copy_s_d(temp6, 0); \569} while (0)570571#define STORE8x8(out, dst) do { \572SD4(out, out, out, out, dst + 0 * BPS, BPS); \573SD4(out, out, out, out, dst + 4 * BPS, BPS); \574} while (0)575576static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {577if (top != NULL) {578const uint64_t out = LD(top);579STORE8x8(out, dst);580} else {581const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;582STORE8x8(out, dst);583}584}585586static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {587if (left != NULL) {588int j;589for (j = 0; j < 8; j += 4) {590const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);591const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);592const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);593const v16u8 L3 = 
(v16u8)__msa_fill_b(left[3]);594const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);595const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);596const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);597const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);598SD4(out0, out1, out2, out3, dst, BPS);599dst += 4 * BPS;600left += 4;601}602} else {603const uint64_t out = 0x8181818181818181ULL;604STORE8x8(out, dst);605}606}607608static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,609const uint8_t* top) {610if (left != NULL) {611if (top != NULL) {612int j;613const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);614const v16u8 T1 = LD_UB(top);615const v16i8 zero = { 0 };616const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);617const v8i16 d = T - TL;618for (j = 0; j < 8; j += 4) {619uint64_t out0, out1, out2, out3;620v16i8 t0, t1;621v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);622v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);623v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);624v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);625ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);626CLIP_SH4_0_255(r0, r1, r2, r3);627PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);628out0 = __msa_copy_s_d((v2i64)t0, 0);629out1 = __msa_copy_s_d((v2i64)t0, 1);630out2 = __msa_copy_s_d((v2i64)t1, 0);631out3 = __msa_copy_s_d((v2i64)t1, 1);632SD4(out0, out1, out2, out3, dst, BPS);633dst += 4 * BPS;634}635} else {636HorizontalPred8x8(dst, left);637}638} else {639if (top != NULL) {640VerticalPred8x8(dst, top);641} else {642const uint64_t out = 0x8181818181818181ULL;643STORE8x8(out, dst);644}645}646}647648static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,649const uint8_t* top) {650uint64_t out;651v16u8 src = { 0 };652if (top != NULL && left != NULL) {653const uint64_t left_m = LD(left);654const uint64_t top_m = LD(top);655INSERT_D2_UB(left_m, top_m, src);656CALC_DC8(src, out);657} else if (left != NULL) { // left but no top658const uint64_t left_m = LD(left);659INSERT_D2_UB(left_m, left_m, 
src);660CALC_DC8(src, out);661} else if (top != NULL) { // top but no left662const uint64_t top_m = LD(top);663INSERT_D2_UB(top_m, top_m, src);664CALC_DC8(src, out);665} else { // no top, no left, nothing.666src = (v16u8)__msa_fill_b(0x80);667out = __msa_copy_s_d((v2i64)src, 0);668}669STORE8x8(out, dst);670}671672static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,673const uint8_t* top) {674// U block675DCMode8x8(C8DC8 + dst, left, top);676VerticalPred8x8(C8VE8 + dst, top);677HorizontalPred8x8(C8HE8 + dst, left);678TrueMotion8x8(C8TM8 + dst, left, top);679// V block680dst += 8;681if (top != NULL) top += 8;682if (left != NULL) left += 16;683DCMode8x8(C8DC8 + dst, left, top);684VerticalPred8x8(C8VE8 + dst, top);685HorizontalPred8x8(C8HE8 + dst, left);686TrueMotion8x8(C8TM8 + dst, left, top);687}688689//------------------------------------------------------------------------------690// Metric691692#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \693v16u8 tmp0, tmp1; \694v8i16 tmp2, tmp3; \695ILVRL_B2_UB(in0, in1, tmp0, tmp1); \696HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \697DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \698ILVRL_B2_UB(in2, in3, tmp0, tmp1); \699HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \700DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \701} while (0)702703#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \704v16u8 tmp0, tmp1; \705v8i16 tmp2, tmp3; \706ILVRL_B2_UB(in0, in1, tmp0, tmp1); \707HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \708DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \709ILVRL_B2_UB(in2, in3, tmp0, tmp1); \710HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \711DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \712} while (0)713714static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {715uint32_t sum;716v16u8 src0, src1, src2, src3, src4, src5, src6, src7;717v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;718v4i32 out0, out1, out2, out3;719720LD_UB8(a, BPS, src0, src1, src2, 
src3, src4, src5, src6, src7);721LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);722PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);723PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);724PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);725PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);726a += 8 * BPS;727b += 8 * BPS;728LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);729LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);730PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);731PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);732PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);733PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);734out0 += out1;735out2 += out3;736out0 += out2;737sum = HADD_SW_S32(out0);738return sum;739}740741static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {742uint32_t sum;743v16u8 src0, src1, src2, src3, src4, src5, src6, src7;744v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;745v4i32 out0, out1, out2, out3;746747LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);748LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);749PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);750PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);751PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);752PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);753out0 += out1;754out2 += out3;755out0 += out2;756sum = HADD_SW_S32(out0);757return sum;758}759760static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {761uint32_t sum;762v16u8 src0, src1, src2, src3, src4, src5, src6, src7;763v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;764v16u8 t0, t1, t2, t3;765v4i32 out0, out1, out2, out3;766767LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);768LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, 
ref5, ref6, ref7);769ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);770PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);771ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);772PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);773out0 += out1;774out2 += out3;775out0 += out2;776sum = HADD_SW_S32(out0);777return sum;778}779780static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {781uint32_t sum = 0;782uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;783v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;784v8i16 diff0, diff1;785v4i32 out0, out1;786787LW4(a, BPS, src0, src1, src2, src3);788LW4(b, BPS, ref0, ref1, ref2, ref3);789INSERT_W4_UB(src0, src1, src2, src3, src);790INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);791ILVRL_B2_UB(src, ref, tmp0, tmp1);792HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);793DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);794out0 += out1;795sum = HADD_SW_S32(out0);796return sum;797}798799//------------------------------------------------------------------------------800// Quantization801802static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],803const VP8Matrix* const mtx) {804int sum;805v8i16 in0, in1, sh0, sh1, out0, out1;806v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;807v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;808const v8i16 zero = { 0 };809const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };810const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };811const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);812813LD_SH2(&in[0], 8, in0, in1);814LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);815tmp4 = __msa_add_a_h(in0, zero);816tmp5 = __msa_add_a_h(in1, zero);817ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);818ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);819HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);820sign0 = (in0 < zero);821sign1 = (in1 < zero); // sign822LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq823ILVRL_H2_SW(zero, tmp0, t0, t1);824ILVRL_H2_SW(zero, tmp1, t2, 
t3);825LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias826MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);827ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);828SRAI_W4_SW(b0, b1, b2, b3, 17);829PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);830tmp0 = (tmp2 > maxlevel);831tmp1 = (tmp3 > maxlevel);832tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);833tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);834SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);835tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);836tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);837LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh838t0 = (s0 > t0);839t1 = (s1 > t1);840t2 = (s2 > t2);841t3 = (s3 > t3);842PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);843tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);844tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);845LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);846MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);847VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);848ST_SH2(in0, in1, &in[0], 8);849ST_SH2(out0, out1, &out[0], 8);850out0 = __msa_add_a_h(out0, out1);851sum = HADD_SH_S32(out0);852return (sum > 0);853}854855static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],856const VP8Matrix* const mtx) {857int nz;858nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;859nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;860return nz;861}862863//------------------------------------------------------------------------------864// Entry point865866extern void VP8EncDspInitMSA(void);867868WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {869VP8ITransform = ITransform_MSA;870VP8FTransform = FTransform_MSA;871VP8FTransformWHT = FTransformWHT_MSA;872873VP8TDisto4x4 = Disto4x4_MSA;874VP8TDisto16x16 = Disto16x16_MSA;875VP8CollectHistogram = CollectHistogram_MSA;876877VP8EncPredLuma4 = Intra4Preds_MSA;878VP8EncPredLuma16 = 
Intra16Preds_MSA;879VP8EncPredChroma8 = IntraChromaPreds_MSA;880881VP8SSE16x16 = SSE16x16_MSA;882VP8SSE16x8 = SSE16x8_MSA;883VP8SSE8x8 = SSE8x8_MSA;884VP8SSE4x4 = SSE4x4_MSA;885886VP8EncQuantizeBlock = QuantizeBlock_MSA;887VP8EncQuantize2Blocks = Quantize2Blocks_MSA;888VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;889}890891#else // !WEBP_USE_MSA892893WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)894895#endif // WEBP_USE_MSA896897898