// Path: blob/master/thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
// (9913 views)
// Copyright 2014 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// MIPS version of dsp functions10//11// Author(s): Djordje Pesut ([email protected])12// Jovan Zelincevic ([email protected])1314#include "src/dsp/dsp.h"1516#if defined(WEBP_USE_MIPS_DSP_R2)1718#include "src/dsp/mips_macro.h"1920static const int kC1 = WEBP_TRANSFORM_AC3_C1;21static const int kC2 = WEBP_TRANSFORM_AC3_C2;2223static void TransformDC(const int16_t* WEBP_RESTRICT in,24uint8_t* WEBP_RESTRICT dst) {25int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;2627__asm__ volatile (28LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,290, 0, 0, 0,300, 1, 2, 3,31BPS)32"lh %[temp5], 0(%[in]) \n\t"33"addiu %[temp5], %[temp5], 4 \n\t"34"ins %[temp5], %[temp5], 16, 16 \n\t"35"shra.ph %[temp5], %[temp5], 3 \n\t"36CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,37temp3, temp1, temp2, temp3, temp4)38STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,39temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,40dst, 0, 1, 2, 3, BPS)4142OUTPUT_EARLY_CLOBBER_REGS_10()43: [in]"r"(in), [dst]"r"(dst)44: "memory"45);46}4748static void TransformAC3(const int16_t* WEBP_RESTRICT in,49uint8_t* WEBP_RESTRICT dst) {50const int a = in[0] + 4;51int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);52const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);53const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);54const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);55int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;56int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, 
temp18;5758__asm__ volatile (59"ins %[c4], %[d4], 16, 16 \n\t"60"replv.ph %[temp1], %[a] \n\t"61"replv.ph %[temp4], %[d1] \n\t"62ADD_SUB_HALVES(temp2, temp3, temp1, c4)63"replv.ph %[temp5], %[c1] \n\t"64SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,65temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)66LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,670, 0, 0, 0,680, 1, 2, 3,69BPS)70CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,71temp11, temp17, temp3, temp5, temp11, temp12)72PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,73temp4, temp7, temp6, temp10, temp9)74STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,75temp17, temp12, temp18, temp1, temp8, temp2, temp4,76temp7, temp6, dst, 0, 1, 2, 3, BPS)7778OUTPUT_EARLY_CLOBBER_REGS_18(),79[c4]"+&r"(c4)80: [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)81: "memory"82);83}8485static void TransformOne(const int16_t* WEBP_RESTRICT in,86uint8_t* WEBP_RESTRICT dst) {87int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;88int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;8990__asm__ volatile (91"ulw %[temp1], 0(%[in]) \n\t"92"ulw %[temp2], 16(%[in]) \n\t"93LOAD_IN_X2(temp5, temp6, 24, 26)94ADD_SUB_HALVES(temp3, temp4, temp1, temp2)95LOAD_IN_X2(temp1, temp2, 8, 10)96MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,97temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,98temp13, temp11, temp14, temp12)99INSERT_HALF_X2(temp8, temp7, temp10, temp9)100"ulw %[temp17], 4(%[in]) \n\t"101"ulw %[temp18], 20(%[in]) \n\t"102ADD_SUB_HALVES(temp1, temp2, temp3, temp8)103ADD_SUB_HALVES(temp5, temp6, temp4, temp7)104ADD_SUB_HALVES(temp7, temp8, temp17, temp18)105LOAD_IN_X2(temp17, temp18, 12, 14)106LOAD_IN_X2(temp9, temp10, 28, 30)107MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,108temp12, temp14, temp11, temp13, temp17, temp18, 
temp9, temp10,109temp15, temp4, temp16, temp17)110INSERT_HALF_X2(temp11, temp12, temp13, temp14)111ADD_SUB_HALVES(temp17, temp8, temp8, temp11)112ADD_SUB_HALVES(temp3, temp4, temp7, temp12)113114// horizontal115SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)116INSERT_HALF_X2(temp1, temp6, temp5, temp2)117SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)118"repl.ph %[temp2], 0x4 \n\t"119INSERT_HALF_X2(temp3, temp8, temp17, temp4)120"addq.ph %[temp1], %[temp1], %[temp2] \n\t"121"addq.ph %[temp6], %[temp6], %[temp2] \n\t"122ADD_SUB_HALVES(temp2, temp4, temp1, temp3)123ADD_SUB_HALVES(temp5, temp7, temp6, temp8)124MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,125temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,126temp6, temp17, temp8, temp18)127MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,128temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,129temp18, temp12, temp17, temp16)130INSERT_HALF_X2(temp1, temp3, temp9, temp13)131INSERT_HALF_X2(temp6, temp8, temp11, temp15)132SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,133temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,134temp6)135PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,136temp16, temp11, temp10, temp15, temp14)137LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,1380, 0, 0, 0,1390, 1, 2, 3,140BPS)141CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,142temp11, temp10, temp11, temp14, temp15)143STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,144temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,145dst, 0, 1, 2, 3, BPS)146147OUTPUT_EARLY_CLOBBER_REGS_18()148: [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)149: "memory", "hi", "lo"150);151}152153static void TransformTwo(const int16_t* WEBP_RESTRICT in,154uint8_t* WEBP_RESTRICT dst, int do_two) {155TransformOne(in, dst);156if 
(do_two) {157TransformOne(in + 16, dst + 4);158}159}160161static WEBP_INLINE void FilterLoop26(uint8_t* p,162int hstride, int vstride, int size,163int thresh, int ithresh, int hev_thresh) {164const int thresh2 = 2 * thresh + 1;165int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;166int temp10, temp11, temp12, temp13, temp14, temp15;167168__asm__ volatile (169".set push \n\t"170".set noreorder \n\t"171"1: \n\t"172"negu %[temp1], %[hstride] \n\t"173"addiu %[size], %[size], -1 \n\t"174"sll %[temp2], %[hstride], 1 \n\t"175"sll %[temp3], %[temp1], 1 \n\t"176"addu %[temp4], %[temp2], %[hstride] \n\t"177"addu %[temp5], %[temp3], %[temp1] \n\t"178"lbu %[temp7], 0(%[p]) \n\t"179"sll %[temp6], %[temp3], 1 \n\t"180"lbux %[temp8], %[temp5](%[p]) \n\t"181"lbux %[temp9], %[temp3](%[p]) \n\t"182"lbux %[temp10], %[temp1](%[p]) \n\t"183"lbux %[temp11], %[temp6](%[p]) \n\t"184"lbux %[temp12], %[hstride](%[p]) \n\t"185"lbux %[temp13], %[temp2](%[p]) \n\t"186"lbux %[temp14], %[temp4](%[p]) \n\t"187"subu %[temp1], %[temp10], %[temp7] \n\t"188"subu %[temp2], %[temp9], %[temp12] \n\t"189"absq_s.w %[temp3], %[temp1] \n\t"190"absq_s.w %[temp4], %[temp2] \n\t"191"negu %[temp1], %[temp1] \n\t"192"sll %[temp3], %[temp3], 2 \n\t"193"addu %[temp15], %[temp3], %[temp4] \n\t"194"subu %[temp3], %[temp15], %[thresh2] \n\t"195"sll %[temp6], %[temp1], 1 \n\t"196"bgtz %[temp3], 3f \n\t"197" subu %[temp4], %[temp11], %[temp8] \n\t"198"absq_s.w %[temp4], %[temp4] \n\t"199"shll_s.w %[temp2], %[temp2], 24 \n\t"200"subu %[temp4], %[temp4], %[ithresh] \n\t"201"bgtz %[temp4], 3f \n\t"202" subu %[temp3], %[temp8], %[temp9] \n\t"203"absq_s.w %[temp3], %[temp3] \n\t"204"subu %[temp3], %[temp3], %[ithresh] \n\t"205"bgtz %[temp3], 3f \n\t"206" subu %[temp5], %[temp9], %[temp10] \n\t"207"absq_s.w %[temp3], %[temp5] \n\t"208"absq_s.w %[temp5], %[temp5] \n\t"209"subu %[temp3], %[temp3], %[ithresh] \n\t"210"bgtz %[temp3], 3f \n\t"211" subu %[temp3], %[temp14], %[temp13] \n\t"212"absq_s.w %[temp3], 
%[temp3] \n\t"213"slt %[temp5], %[hev_thresh], %[temp5] \n\t"214"subu %[temp3], %[temp3], %[ithresh] \n\t"215"bgtz %[temp3], 3f \n\t"216" subu %[temp3], %[temp13], %[temp12] \n\t"217"absq_s.w %[temp3], %[temp3] \n\t"218"sra %[temp4], %[temp2], 24 \n\t"219"subu %[temp3], %[temp3], %[ithresh] \n\t"220"bgtz %[temp3], 3f \n\t"221" subu %[temp15], %[temp12], %[temp7] \n\t"222"absq_s.w %[temp3], %[temp15] \n\t"223"absq_s.w %[temp15], %[temp15] \n\t"224"subu %[temp3], %[temp3], %[ithresh] \n\t"225"bgtz %[temp3], 3f \n\t"226" slt %[temp15], %[hev_thresh], %[temp15] \n\t"227"addu %[temp3], %[temp6], %[temp1] \n\t"228"or %[temp2], %[temp5], %[temp15] \n\t"229"addu %[temp5], %[temp4], %[temp3] \n\t"230"beqz %[temp2], 4f \n\t"231" shra_r.w %[temp1], %[temp5], 3 \n\t"232"addiu %[temp2], %[temp5], 3 \n\t"233"sra %[temp2], %[temp2], 3 \n\t"234"shll_s.w %[temp1], %[temp1], 27 \n\t"235"shll_s.w %[temp2], %[temp2], 27 \n\t"236"subu %[temp3], %[p], %[hstride] \n\t"237"sra %[temp1], %[temp1], 27 \n\t"238"sra %[temp2], %[temp2], 27 \n\t"239"subu %[temp1], %[temp7], %[temp1] \n\t"240"addu %[temp2], %[temp10], %[temp2] \n\t"241"lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"242"lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"243"sb %[temp2], 0(%[temp3]) \n\t"244"j 3f \n\t"245" sb %[temp1], 0(%[p]) \n\t"246"4: \n\t"247"shll_s.w %[temp5], %[temp5], 24 \n\t"248"subu %[temp14], %[p], %[hstride] \n\t"249"subu %[temp11], %[temp14], %[hstride] \n\t"250"sra %[temp6], %[temp5], 24 \n\t"251"sll %[temp1], %[temp6], 3 \n\t"252"subu %[temp15], %[temp11], %[hstride] \n\t"253"addu %[temp2], %[temp6], %[temp1] \n\t"254"sll %[temp3], %[temp2], 1 \n\t"255"addu %[temp4], %[temp3], %[temp2] \n\t"256"addiu %[temp2], %[temp2], 63 \n\t"257"addiu %[temp3], %[temp3], 63 \n\t"258"addiu %[temp4], %[temp4], 63 \n\t"259"sra %[temp2], %[temp2], 7 \n\t"260"sra %[temp3], %[temp3], 7 \n\t"261"sra %[temp4], %[temp4], 7 \n\t"262"addu %[temp1], %[temp8], %[temp2] \n\t"263"addu %[temp5], %[temp9], %[temp3] \n\t"264"addu %[temp6], 
%[temp10], %[temp4] \n\t"265"subu %[temp8], %[temp7], %[temp4] \n\t"266"subu %[temp7], %[temp12], %[temp3] \n\t"267"addu %[temp10], %[p], %[hstride] \n\t"268"subu %[temp9], %[temp13], %[temp2] \n\t"269"addu %[temp12], %[temp10], %[hstride] \n\t"270"lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"271"lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"272"lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"273"lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"274"lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"275"lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"276"sb %[temp2], 0(%[temp15]) \n\t"277"sb %[temp3], 0(%[temp11]) \n\t"278"sb %[temp4], 0(%[temp14]) \n\t"279"sb %[temp5], 0(%[p]) \n\t"280"sb %[temp6], 0(%[temp10]) \n\t"281"sb %[temp8], 0(%[temp12]) \n\t"282"3: \n\t"283"bgtz %[size], 1b \n\t"284" addu %[p], %[p], %[vstride] \n\t"285".set pop \n\t"286: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),287[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),288[temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),289[temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),290[temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),291[size]"+&r"(size), [p]"+&r"(p)292: [hstride]"r"(hstride), [thresh2]"r"(thresh2),293[ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),294[VP8kclip1]"r"(VP8kclip1)295: "memory"296);297}298299static WEBP_INLINE void FilterLoop24(uint8_t* p,300int hstride, int vstride, int size,301int thresh, int ithresh, int hev_thresh) {302int p0, q0, p1, q1, p2, q2, p3, q3;303int step1, step2, temp1, temp2, temp3, temp4;304uint8_t* pTemp0;305uint8_t* pTemp1;306const int thresh2 = 2 * thresh + 1;307308__asm__ volatile (309".set push \n\t"310".set noreorder \n\t"311"bltz %[size], 3f \n\t"312" nop \n\t"313"2: \n\t"314"negu %[step1], %[hstride] \n\t"315"lbu %[q0], 0(%[p]) \n\t"316"lbux %[p0], %[step1](%[p]) \n\t"317"subu %[step1], %[step1], %[hstride] \n\t"318"lbux %[q1], %[hstride](%[p]) \n\t"319"subu %[temp1], %[p0], %[q0] 
\n\t"320"lbux %[p1], %[step1](%[p]) \n\t"321"addu %[step2], %[hstride], %[hstride] \n\t"322"absq_s.w %[temp2], %[temp1] \n\t"323"subu %[temp3], %[p1], %[q1] \n\t"324"absq_s.w %[temp4], %[temp3] \n\t"325"sll %[temp2], %[temp2], 2 \n\t"326"addu %[temp2], %[temp2], %[temp4] \n\t"327"subu %[temp4], %[temp2], %[thresh2] \n\t"328"subu %[step1], %[step1], %[hstride] \n\t"329"bgtz %[temp4], 0f \n\t"330" lbux %[p2], %[step1](%[p]) \n\t"331"subu %[step1], %[step1], %[hstride] \n\t"332"lbux %[q2], %[step2](%[p]) \n\t"333"lbux %[p3], %[step1](%[p]) \n\t"334"subu %[temp4], %[p2], %[p1] \n\t"335"addu %[step2], %[step2], %[hstride] \n\t"336"subu %[temp2], %[p3], %[p2] \n\t"337"absq_s.w %[temp4], %[temp4] \n\t"338"absq_s.w %[temp2], %[temp2] \n\t"339"lbux %[q3], %[step2](%[p]) \n\t"340"subu %[temp4], %[temp4], %[ithresh] \n\t"341"negu %[temp1], %[temp1] \n\t"342"bgtz %[temp4], 0f \n\t"343" subu %[temp2], %[temp2], %[ithresh] \n\t"344"subu %[p3], %[p1], %[p0] \n\t"345"bgtz %[temp2], 0f \n\t"346" absq_s.w %[p3], %[p3] \n\t"347"subu %[temp4], %[q3], %[q2] \n\t"348"subu %[pTemp0], %[p], %[hstride] \n\t"349"absq_s.w %[temp4], %[temp4] \n\t"350"subu %[temp2], %[p3], %[ithresh] \n\t"351"sll %[step1], %[temp1], 1 \n\t"352"bgtz %[temp2], 0f \n\t"353" subu %[temp4], %[temp4], %[ithresh] \n\t"354"subu %[temp2], %[q2], %[q1] \n\t"355"bgtz %[temp4], 0f \n\t"356" absq_s.w %[temp2], %[temp2] \n\t"357"subu %[q3], %[q1], %[q0] \n\t"358"absq_s.w %[q3], %[q3] \n\t"359"subu %[temp2], %[temp2], %[ithresh] \n\t"360"addu %[temp1], %[temp1], %[step1] \n\t"361"bgtz %[temp2], 0f \n\t"362" subu %[temp4], %[q3], %[ithresh] \n\t"363"slt %[p3], %[hev_thresh], %[p3] \n\t"364"bgtz %[temp4], 0f \n\t"365" slt %[q3], %[hev_thresh], %[q3] \n\t"366"or %[q3], %[q3], %[p3] \n\t"367"bgtz %[q3], 1f \n\t"368" shra_r.w %[temp2], %[temp1], 3 \n\t"369"addiu %[temp1], %[temp1], 3 \n\t"370"sra %[temp1], %[temp1], 3 \n\t"371"shll_s.w %[temp2], %[temp2], 27 \n\t"372"shll_s.w %[temp1], %[temp1], 27 \n\t"373"addu %[pTemp1], %[p], 
%[hstride] \n\t"374"sra %[temp2], %[temp2], 27 \n\t"375"sra %[temp1], %[temp1], 27 \n\t"376"addiu %[step1], %[temp2], 1 \n\t"377"sra %[step1], %[step1], 1 \n\t"378"addu %[p0], %[p0], %[temp1] \n\t"379"addu %[p1], %[p1], %[step1] \n\t"380"subu %[q0], %[q0], %[temp2] \n\t"381"subu %[q1], %[q1], %[step1] \n\t"382"lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"383"lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"384"lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"385"sb %[temp2], 0(%[pTemp0]) \n\t"386"lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"387"subu %[pTemp0], %[pTemp0], %[hstride] \n\t"388"sb %[temp3], 0(%[p]) \n\t"389"sb %[temp4], 0(%[pTemp1]) \n\t"390"j 0f \n\t"391" sb %[temp1], 0(%[pTemp0]) \n\t"392"1: \n\t"393"shll_s.w %[temp3], %[temp3], 24 \n\t"394"sra %[temp3], %[temp3], 24 \n\t"395"addu %[temp1], %[temp1], %[temp3] \n\t"396"shra_r.w %[temp2], %[temp1], 3 \n\t"397"addiu %[temp1], %[temp1], 3 \n\t"398"shll_s.w %[temp2], %[temp2], 27 \n\t"399"sra %[temp1], %[temp1], 3 \n\t"400"shll_s.w %[temp1], %[temp1], 27 \n\t"401"sra %[temp2], %[temp2], 27 \n\t"402"sra %[temp1], %[temp1], 27 \n\t"403"addu %[p0], %[p0], %[temp1] \n\t"404"subu %[q0], %[q0], %[temp2] \n\t"405"lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"406"lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"407"sb %[temp2], 0(%[p]) \n\t"408"sb %[temp1], 0(%[pTemp0]) \n\t"409"0: \n\t"410"subu %[size], %[size], 1 \n\t"411"bgtz %[size], 2b \n\t"412" addu %[p], %[p], %[vstride] \n\t"413"3: \n\t"414".set pop \n\t"415: [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),416[p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),417[step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),418[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),419[pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),420[size]"+&r"(size)421: [vstride]"r"(vstride), [ithresh]"r"(ithresh),422[hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),423[VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)424: "memory"425);426}427428// on macroblock edges429static 
void VFilter16(uint8_t* p, int stride,430int thresh, int ithresh, int hev_thresh) {431FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);432}433434static void HFilter16(uint8_t* p, int stride,435int thresh, int ithresh, int hev_thresh) {436FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);437}438439// 8-pixels wide variant, for chroma filtering440static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,441int stride, int thresh, int ithresh, int hev_thresh) {442FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);443FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);444}445446static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,447int stride, int thresh, int ithresh, int hev_thresh) {448FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);449FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);450}451452// on three inner edges453static void VFilter16i(uint8_t* p, int stride,454int thresh, int ithresh, int hev_thresh) {455int k;456for (k = 3; k > 0; --k) {457p += 4 * stride;458FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);459}460}461462static void HFilter16i(uint8_t* p, int stride,463int thresh, int ithresh, int hev_thresh) {464int k;465for (k = 3; k > 0; --k) {466p += 4;467FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);468}469}470471static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,472int stride, int thresh, int ithresh, int hev_thresh) {473FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);474FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);475}476477static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,478int stride, int thresh, int ithresh, int hev_thresh) {479FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);480FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, 
hev_thresh);481}482483//------------------------------------------------------------------------------484// Simple In-loop filtering (Paragraph 15.2)485486static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {487int i;488const int thresh2 = 2 * thresh + 1;489int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;490uint8_t* p1 = p - stride;491__asm__ volatile (492".set push \n\t"493".set noreorder \n\t"494"li %[i], 16 \n\t"495"0: \n\t"496"negu %[temp4], %[stride] \n\t"497"sll %[temp5], %[temp4], 1 \n\t"498"lbu %[temp2], 0(%[p]) \n\t"499"lbux %[temp3], %[stride](%[p]) \n\t"500"lbux %[temp1], %[temp4](%[p]) \n\t"501"lbux %[temp0], %[temp5](%[p]) \n\t"502"subu %[temp7], %[temp1], %[temp2] \n\t"503"subu %[temp6], %[temp0], %[temp3] \n\t"504"absq_s.w %[temp4], %[temp7] \n\t"505"absq_s.w %[temp5], %[temp6] \n\t"506"sll %[temp4], %[temp4], 2 \n\t"507"subu %[temp5], %[temp5], %[thresh2] \n\t"508"addu %[temp5], %[temp4], %[temp5] \n\t"509"negu %[temp8], %[temp7] \n\t"510"bgtz %[temp5], 1f \n\t"511" addiu %[i], %[i], -1 \n\t"512"sll %[temp4], %[temp8], 1 \n\t"513"shll_s.w %[temp5], %[temp6], 24 \n\t"514"addu %[temp3], %[temp4], %[temp8] \n\t"515"sra %[temp5], %[temp5], 24 \n\t"516"addu %[temp3], %[temp3], %[temp5] \n\t"517"addiu %[temp7], %[temp3], 3 \n\t"518"sra %[temp7], %[temp7], 3 \n\t"519"shra_r.w %[temp8], %[temp3], 3 \n\t"520"shll_s.w %[temp0], %[temp7], 27 \n\t"521"shll_s.w %[temp4], %[temp8], 27 \n\t"522"sra %[temp0], %[temp0], 27 \n\t"523"sra %[temp4], %[temp4], 27 \n\t"524"addu %[temp7], %[temp1], %[temp0] \n\t"525"subu %[temp2], %[temp2], %[temp4] \n\t"526"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"527"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"528"sb %[temp3], 0(%[p1]) \n\t"529"sb %[temp4], 0(%[p]) \n\t"530"1: \n\t"531"addiu %[p1], %[p1], 1 \n\t"532"bgtz %[i], 0b \n\t"533" addiu %[p], %[p], 1 \n\t"534" .set pop \n\t"535: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),536[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), 
[temp5]"=&r"(temp5),537[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),538[p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)539: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)540: "memory"541);542}543544// TEMP0 = SRC[A + A1 * BPS]545// TEMP1 = SRC[B + B1 * BPS]546// TEMP2 = SRC[C + C1 * BPS]547// TEMP3 = SRC[D + D1 * BPS]548#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \549A, A1, B, B1, C, C1, D, D1, SRC) \550"lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \551"lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \552"lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \553"lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \554555static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {556int i;557const int thresh2 = 2 * thresh + 1;558int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;559__asm__ volatile (560".set push \n\t"561".set noreorder \n\t"562"li %[i], 16 \n\t"563"0: \n\t"564LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)565"subu %[temp7], %[temp1], %[temp2] \n\t"566"subu %[temp6], %[temp0], %[temp3] \n\t"567"absq_s.w %[temp4], %[temp7] \n\t"568"absq_s.w %[temp5], %[temp6] \n\t"569"sll %[temp4], %[temp4], 2 \n\t"570"addu %[temp5], %[temp4], %[temp5] \n\t"571"subu %[temp5], %[temp5], %[thresh2] \n\t"572"negu %[temp8], %[temp7] \n\t"573"bgtz %[temp5], 1f \n\t"574" addiu %[i], %[i], -1 \n\t"575"sll %[temp4], %[temp8], 1 \n\t"576"shll_s.w %[temp5], %[temp6], 24 \n\t"577"addu %[temp3], %[temp4], %[temp8] \n\t"578"sra %[temp5], %[temp5], 24 \n\t"579"addu %[temp3], %[temp3], %[temp5] \n\t"580"addiu %[temp7], %[temp3], 3 \n\t"581"sra %[temp7], %[temp7], 3 \n\t"582"shra_r.w %[temp8], %[temp3], 3 \n\t"583"shll_s.w %[temp0], %[temp7], 27 \n\t"584"shll_s.w %[temp4], %[temp8], 27 \n\t"585"sra %[temp0], %[temp0], 27 \n\t"586"sra %[temp4], %[temp4], 27 \n\t"587"addu %[temp7], %[temp1], %[temp0] \n\t"588"subu %[temp2], %[temp2], 
%[temp4] \n\t"589"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"590"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"591"sb %[temp3], -1(%[p]) \n\t"592"sb %[temp4], 0(%[p]) \n\t"593"1: \n\t"594"bgtz %[i], 0b \n\t"595" addu %[p], %[p], %[stride] \n\t"596".set pop \n\t"597: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),598[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),599[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),600[p]"+&r"(p), [i]"=&r"(i)601: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)602: "memory"603);604}605606static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {607int k;608for (k = 3; k > 0; --k) {609p += 4 * stride;610SimpleVFilter16(p, stride, thresh);611}612}613614static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {615int k;616for (k = 3; k > 0; --k) {617p += 4;618SimpleHFilter16(p, stride, thresh);619}620}621622// DST[A * BPS] = TEMP0623// DST[B + C * BPS] = TEMP1624#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \625"usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \626"usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"627628static void VE4(uint8_t* dst) { // vertical629const uint8_t* top = dst - BPS;630int temp0, temp1, temp2, temp3, temp4, temp5, temp6;631__asm__ volatile (632"ulw %[temp0], -1(%[top]) \n\t"633"ulh %[temp1], 3(%[top]) \n\t"634"preceu.ph.qbr %[temp2], %[temp0] \n\t"635"preceu.ph.qbl %[temp3], %[temp0] \n\t"636"preceu.ph.qbr %[temp4], %[temp1] \n\t"637"packrl.ph %[temp5], %[temp3], %[temp2] \n\t"638"packrl.ph %[temp6], %[temp4], %[temp3] \n\t"639"shll.ph %[temp5], %[temp5], 1 \n\t"640"shll.ph %[temp6], %[temp6], 1 \n\t"641"addq.ph %[temp2], %[temp5], %[temp2] \n\t"642"addq.ph %[temp6], %[temp6], %[temp4] \n\t"643"addq.ph %[temp2], %[temp2], %[temp3] \n\t"644"addq.ph %[temp6], %[temp6], %[temp3] \n\t"645"shra_r.ph %[temp2], %[temp2], 2 \n\t"646"shra_r.ph %[temp6], %[temp6], 2 \n\t"647"precr.qb.ph %[temp4], %[temp6], 
%[temp2] \n\t"648STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)649STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)650: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),651[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),652[temp6]"=&r"(temp6)653: [top]"r"(top), [dst]"r"(dst)654: "memory"655);656}657658static void DC4(uint8_t* dst) { // DC659int temp0, temp1, temp2, temp3, temp4;660__asm__ volatile (661"ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"662LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)663"ins %[temp1], %[temp2], 8, 8 \n\t"664"ins %[temp1], %[temp3], 16, 8 \n\t"665"ins %[temp1], %[temp4], 24, 8 \n\t"666"raddu.w.qb %[temp0], %[temp0] \n\t"667"raddu.w.qb %[temp1], %[temp1] \n\t"668"addu %[temp0], %[temp0], %[temp1] \n\t"669"shra_r.w %[temp0], %[temp0], 3 \n\t"670"replv.qb %[temp0], %[temp0] \n\t"671STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)672STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)673: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),674[temp3]"=&r"(temp3), [temp4]"=&r"(temp4)675: [dst]"r"(dst)676: "memory"677);678}679680static void RD4(uint8_t* dst) { // Down-right681int temp0, temp1, temp2, temp3, temp4;682int temp5, temp6, temp7, temp8;683__asm__ volatile (684LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)685"ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"686"ins %[temp1], %[temp0], 16, 16 \n\t"687"preceu.ph.qbr %[temp5], %[temp7] \n\t"688"ins %[temp2], %[temp1], 16, 16 \n\t"689"preceu.ph.qbl %[temp4], %[temp7] \n\t"690"ins %[temp3], %[temp2], 16, 16 \n\t"691"shll.ph %[temp2], %[temp2], 1 \n\t"692"addq.ph %[temp3], %[temp3], %[temp1] \n\t"693"packrl.ph %[temp6], %[temp5], %[temp1] \n\t"694"addq.ph %[temp3], %[temp3], %[temp2] \n\t"695"addq.ph %[temp1], %[temp1], %[temp5] \n\t"696"shll.ph %[temp6], %[temp6], 1 \n\t"697"addq.ph %[temp1], %[temp1], %[temp6] \n\t"698"packrl.ph %[temp0], %[temp4], %[temp5] \n\t"699"addq.ph %[temp8], %[temp5], %[temp4] \n\t"700"shra_r.ph %[temp3], 
%[temp3], 2 \n\t"701"shll.ph %[temp0], %[temp0], 1 \n\t"702"shra_r.ph %[temp1], %[temp1], 2 \n\t"703"addq.ph %[temp8], %[temp0], %[temp8] \n\t"704"lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"705"precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"706"shra_r.ph %[temp8], %[temp8], 2 \n\t"707"ins %[temp7], %[temp5], 0, 8 \n\t"708"precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"709"raddu.w.qb %[temp4], %[temp7] \n\t"710"precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"711"shra_r.w %[temp4], %[temp4], 2 \n\t"712STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)713"prepend %[temp2], %[temp8], 8 \n\t"714"prepend %[temp6], %[temp4], 8 \n\t"715STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)716: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),717[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),718[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)719: [dst]"r"(dst)720: "memory"721);722}723724// TEMP0 = SRC[A * BPS]725// TEMP1 = SRC[B + C * BPS]726#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \727"ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \728"ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"729730static void LD4(uint8_t* dst) { // Down-Left731int temp0, temp1, temp2, temp3, temp4;732int temp5, temp6, temp7, temp8, temp9;733__asm__ volatile (734LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)735"preceu.ph.qbl %[temp2], %[temp0] \n\t"736"preceu.ph.qbr %[temp3], %[temp0] \n\t"737"preceu.ph.qbr %[temp4], %[temp1] \n\t"738"preceu.ph.qbl %[temp5], %[temp1] \n\t"739"packrl.ph %[temp6], %[temp2], %[temp3] \n\t"740"packrl.ph %[temp7], %[temp4], %[temp2] \n\t"741"packrl.ph %[temp8], %[temp5], %[temp4] \n\t"742"shll.ph %[temp6], %[temp6], 1 \n\t"743"addq.ph %[temp9], %[temp2], %[temp6] \n\t"744"shll.ph %[temp7], %[temp7], 1 \n\t"745"addq.ph %[temp9], %[temp9], %[temp3] \n\t"746"shll.ph %[temp8], %[temp8], 1 \n\t"747"shra_r.ph %[temp9], %[temp9], 2 \n\t"748"addq.ph %[temp3], %[temp4], %[temp7] \n\t"749"addq.ph %[temp0], %[temp5], %[temp8] 
\n\t"750"addq.ph %[temp3], %[temp3], %[temp2] \n\t"751"addq.ph %[temp0], %[temp0], %[temp4] \n\t"752"shra_r.ph %[temp3], %[temp3], 2 \n\t"753"shra_r.ph %[temp0], %[temp0], 2 \n\t"754"srl %[temp1], %[temp1], 24 \n\t"755"sll %[temp1], %[temp1], 1 \n\t"756"raddu.w.qb %[temp5], %[temp5] \n\t"757"precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"758"precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"759"addu %[temp1], %[temp1], %[temp5] \n\t"760"shra_r.w %[temp1], %[temp1], 2 \n\t"761STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)762"prepend %[temp9], %[temp0], 8 \n\t"763"prepend %[temp3], %[temp1], 8 \n\t"764STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)765: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),766[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),767[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),768[temp9]"=&r"(temp9)769: [dst]"r"(dst)770: "memory"771);772}773774//------------------------------------------------------------------------------775// Chroma776777static void DC8uv(uint8_t* dst) { // DC778int temp0, temp1, temp2, temp3, temp4;779int temp5, temp6, temp7, temp8, temp9;780__asm__ volatile (781LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)782LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)783LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)784"raddu.w.qb %[temp0], %[temp0] \n\t"785"raddu.w.qb %[temp1], %[temp1] \n\t"786"addu %[temp2], %[temp2], %[temp3] \n\t"787"addu %[temp4], %[temp4], %[temp5] \n\t"788"addu %[temp6], %[temp6], %[temp7] \n\t"789"addu %[temp8], %[temp8], %[temp9] \n\t"790"addu %[temp0], %[temp0], %[temp1] \n\t"791"addu %[temp2], %[temp2], %[temp4] \n\t"792"addu %[temp6], %[temp6], %[temp8] \n\t"793"addu %[temp0], %[temp0], %[temp2] \n\t"794"addu %[temp0], %[temp0], %[temp6] \n\t"795"shra_r.w %[temp0], %[temp0], 4 \n\t"796"replv.qb %[temp0], %[temp0] \n\t"797STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)798STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)799STORE_8_BYTES(temp0, 
temp0, 2, 4, 2, dst)800STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)801STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)802STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)803STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)804STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)805: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),806[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),807[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),808[temp9]"=&r"(temp9)809: [dst]"r"(dst)810: "memory"811);812}813814static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples815int temp0, temp1;816__asm__ volatile (817LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)818"raddu.w.qb %[temp0], %[temp0] \n\t"819"raddu.w.qb %[temp1], %[temp1] \n\t"820"addu %[temp0], %[temp0], %[temp1] \n\t"821"shra_r.w %[temp0], %[temp0], 3 \n\t"822"replv.qb %[temp0], %[temp0] \n\t"823STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)824STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)825STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)826STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)827STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)828STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)829STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)830STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)831: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)832: [dst]"r"(dst)833: "memory"834);835}836837static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples838int temp0, temp1, temp2, temp3, temp4;839int temp5, temp6, temp7, temp8;840__asm__ volatile (841LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)842LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)843"addu %[temp2], %[temp2], %[temp3] \n\t"844"addu %[temp4], %[temp4], %[temp5] \n\t"845"addu %[temp6], %[temp6], %[temp7] \n\t"846"addu %[temp8], %[temp8], %[temp1] \n\t"847"addu %[temp2], %[temp2], %[temp4] \n\t"848"addu %[temp6], %[temp6], %[temp8] \n\t"849"addu %[temp0], %[temp6], %[temp2] \n\t"850"shra_r.w %[temp0], %[temp0], 3 \n\t"851"replv.qb %[temp0], %[temp0] 
\n\t"852STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)853STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)854STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)855STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)856STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)857STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)858STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)859STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)860: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),861[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),862[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)863: [dst]"r"(dst)864: "memory"865);866}867868#undef LOAD_8_BYTES869#undef STORE_8_BYTES870#undef LOAD_4_BYTES871872#define CLIPPING(SIZE) \873"preceu.ph.qbl %[temp2], %[temp0] \n\t" \874"preceu.ph.qbr %[temp0], %[temp0] \n\t" \875".if " #SIZE " == 8 \n\t" \876"preceu.ph.qbl %[temp3], %[temp1] \n\t" \877"preceu.ph.qbr %[temp1], %[temp1] \n\t" \878".endif \n\t" \879"addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \880"addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \881".if " #SIZE " == 8 \n\t" \882"addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \883"addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \884".endif \n\t" \885"shll_s.ph %[temp2], %[temp2], 7 \n\t" \886"shll_s.ph %[temp0], %[temp0], 7 \n\t" \887".if " #SIZE " == 8 \n\t" \888"shll_s.ph %[temp3], %[temp3], 7 \n\t" \889"shll_s.ph %[temp1], %[temp1], 7 \n\t" \890".endif \n\t" \891"precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \892".if " #SIZE " == 8 \n\t" \893"precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \894".endif \n\t"895896897#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \898int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \899int temp0, temp1, temp2, temp3; \900__asm__ volatile ( \901".if " #SIZE " < 8 \n\t" \902"ulw %[temp0], 0(%[top]) \n\t" \903"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \904CLIPPING(4) \905"usw %[temp0], 0(%[dst]) \n\t" \906".else \n\t" \907"ulw %[temp0], 0(%[top]) \n\t" \908"ulw %[temp1], 4(%[top]) \n\t" \909"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" 
\910CLIPPING(8) \911"usw %[temp0], 0(%[dst]) \n\t" \912"usw %[temp1], 4(%[dst]) \n\t" \913".if " #SIZE " == 16 \n\t" \914"ulw %[temp0], 8(%[top]) \n\t" \915"ulw %[temp1], 12(%[top]) \n\t" \916CLIPPING(8) \917"usw %[temp0], 8(%[dst]) \n\t" \918"usw %[temp1], 12(%[dst]) \n\t" \919".endif \n\t" \920".endif \n\t" \921: [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \922[temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \923: [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \924: "memory" \925); \926} while (0)927928#define CLIP_TO_DST(DST, SIZE) do { \929int y; \930const uint8_t* top = (DST) - BPS; \931const int top_1 = ((int)top[-1] << 16) + top[-1]; \932for (y = 0; y < (SIZE); ++y) { \933CLIP_8B_TO_DST((DST), top, (SIZE)); \934(DST) += BPS; \935} \936} while (0)937938#define TRUE_MOTION(DST, SIZE) \939static void TrueMotion##SIZE(uint8_t* (DST)) { \940CLIP_TO_DST((DST), (SIZE)); \941}942943TRUE_MOTION(dst, 4)944TRUE_MOTION(dst, 8)945TRUE_MOTION(dst, 16)946947#undef TRUE_MOTION948#undef CLIP_TO_DST949#undef CLIP_8B_TO_DST950#undef CLIPPING951952//------------------------------------------------------------------------------953// Entry point954955extern void VP8DspInitMIPSdspR2(void);956957WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {958VP8TransformDC = TransformDC;959VP8TransformAC3 = TransformAC3;960VP8Transform = TransformTwo;961962VP8VFilter16 = VFilter16;963VP8HFilter16 = HFilter16;964VP8VFilter8 = VFilter8;965VP8HFilter8 = HFilter8;966VP8VFilter16i = VFilter16i;967VP8HFilter16i = HFilter16i;968VP8VFilter8i = VFilter8i;969VP8HFilter8i = HFilter8i;970VP8SimpleVFilter16 = SimpleVFilter16;971VP8SimpleHFilter16 = SimpleHFilter16;972VP8SimpleVFilter16i = SimpleVFilter16i;973VP8SimpleHFilter16i = SimpleHFilter16i;974975VP8PredLuma4[0] = DC4;976VP8PredLuma4[1] = TrueMotion4;977VP8PredLuma4[2] = VE4;978VP8PredLuma4[4] = RD4;979VP8PredLuma4[6] = LD4;980981VP8PredChroma8[0] = DC8uv;982VP8PredChroma8[1] = TrueMotion8;983VP8PredChroma8[4] = 
DC8uvNoTop;984VP8PredChroma8[5] = DC8uvNoLeft;985986VP8PredLuma16[1] = TrueMotion16;987}988989#else // !WEBP_USE_MIPS_DSP_R2990991WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)992993#endif // WEBP_USE_MIPS_DSP_R2994995996