Path: blob/master/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c
9913 views
// Copyright 2015 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// MIPS version of lossless functions10//11// Author(s): Djordje Pesut ([email protected])12// Jovan Zelincevic ([email protected])1314#include "src/dsp/dsp.h"15#include "src/dsp/lossless.h"16#include "src/dsp/lossless_common.h"1718#if defined(WEBP_USE_MIPS32)1920#include <assert.h>21#include <math.h>22#include <stdlib.h>23#include <string.h>2425static uint64_t FastSLog2Slow_MIPS32(uint32_t v) {26assert(v >= LOG_LOOKUP_IDX_MAX);27if (v < APPROX_LOG_WITH_CORRECTION_MAX) {28uint32_t log_cnt, y;29uint64_t correction;30const int c24 = 24;31uint32_t temp;3233// Xf = 256 = 2^834// log_cnt is index of leading one in upper 24 bits35__asm__ volatile(36"clz %[log_cnt], %[v] \n\t"37"addiu %[y], $zero, 1 \n\t"38"subu %[log_cnt], %[c24], %[log_cnt] \n\t"39"sllv %[y], %[y], %[log_cnt] \n\t"40"srlv %[temp], %[v], %[log_cnt] \n\t"41: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),42[temp]"=r"(temp)43: [c24]"r"(c24), [v]"r"(v)44);4546// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 25647// Xf = floor(Xf) * (1 + (v % y) / v)48// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)49// The correction factor: log(1 + d) ~ d; for very small d values, so50// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v5152// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)53correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));54return (uint64_t)v * (kLog2Table[temp] +55((uint64_t)log_cnt << LOG_2_PRECISION_BITS)) +56correction;57} else {58return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);59}60}6162static uint32_t FastLog2Slow_MIPS32(uint32_t v) {63assert(v >= LOG_LOOKUP_IDX_MAX);64if (v < APPROX_LOG_WITH_CORRECTION_MAX) {65uint32_t log_cnt, y;66const int c24 = 24;67uint32_t log_2;68uint32_t temp;6970__asm__ volatile(71"clz %[log_cnt], %[v] \n\t"72"addiu %[y], $zero, 1 \n\t"73"subu %[log_cnt], %[c24], %[log_cnt] \n\t"74"sllv %[y], %[y], %[log_cnt] \n\t"75"srlv %[temp], %[v], %[log_cnt] \n\t"76: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),77[temp]"=r"(temp)78: [c24]"r"(c24), [v]"r"(v)79);8081log_2 = kLog2Table[temp] + (log_cnt << LOG_2_PRECISION_BITS);82if (v >= APPROX_LOG_MAX) {83// Since the division is still expensive, add this correction factor only84// for large values of 'v'.85const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));86log_2 += (uint32_t)DivRound(correction, v);87}88return log_2;89} else {90return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);91}92}9394// C version of this function:95// int i = 0;96// int64_t cost = 0;97// const uint32_t* pop = &population[4];98// const uint32_t* LoopEnd = &population[length];99// while (pop != LoopEnd) {100// ++i;101// cost += i * *pop;102// cost += i * *(pop + 1);103// pop += 2;104// }105// return cost;106static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {107int i, temp0, temp1;108const uint32_t* pop = &population[4];109const uint32_t* const LoopEnd = &population[length];110111__asm__ volatile(112"mult $zero, $zero \n\t"113"xor %[i], %[i], %[i] \n\t"114"beq %[pop], %[LoopEnd], 2f \n\t"115"1: \n\t"116"lw %[temp0], 0(%[pop]) \n\t"117"lw %[temp1], 4(%[pop]) \n\t"118"addiu %[i], %[i], 1 \n\t"119"addiu %[pop], %[pop], 8 \n\t"120"madd %[i], %[temp0] \n\t"121"madd %[i], %[temp1] \n\t"122"bne %[pop], %[LoopEnd], 1b \n\t"123"2: \n\t"124"mfhi %[temp0] \n\t"125"mflo %[temp1] \n\t"126: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),127[i]"=&r"(i), [pop]"+r"(pop)128: [LoopEnd]"r"(LoopEnd)129: "memory", "hi", "lo"130);131132return ((int64_t)temp0 << 32 | temp1);133}134135// C version of this function:136// int i = 0;137// int64_t cost = 0;138// const uint32_t* pX = &X[4];139// const uint32_t* pY = &Y[4];140// const uint32_t* LoopEnd = &X[length];141// while (pX != LoopEnd) {142// const uint32_t xy0 = *pX + *pY;143// const uint32_t xy1 = *(pX + 1) + *(pY + 1);144// ++i;145// cost += i * xy0;146// cost += i * xy1;147// pX += 2;148// pY += 2;149// }150// return cost;151static uint32_t ExtraCostCombined_MIPS32(const uint32_t* WEBP_RESTRICT const X,152const uint32_t* WEBP_RESTRICT const Y,153int length) {154int i, temp0, temp1, temp2, temp3;155const uint32_t* pX = &X[4];156const uint32_t* pY = &Y[4];157const uint32_t* const LoopEnd = &X[length];158159__asm__ volatile(160"mult $zero, $zero \n\t"161"xor %[i], %[i], %[i] \n\t"162"beq %[pX], %[LoopEnd], 2f \n\t"163"1: \n\t"164"lw %[temp0], 0(%[pX]) \n\t"165"lw %[temp1], 0(%[pY]) \n\t"166"lw %[temp2], 4(%[pX]) \n\t"167"lw %[temp3], 4(%[pY]) \n\t"168"addiu %[i], %[i], 1 \n\t"169"addu %[temp0], %[temp0], %[temp1] \n\t"170"addu %[temp2], %[temp2], %[temp3] \n\t"171"addiu %[pX], %[pX], 8 \n\t"172"addiu %[pY], %[pY], 8 \n\t"173"madd %[i], %[temp0] \n\t"174"madd %[i], %[temp2] \n\t"175"bne %[pX], %[LoopEnd], 1b \n\t"176"2: \n\t"177"mfhi %[temp0] \n\t"178"mflo %[temp1] \n\t"179: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),180[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),181[i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)182: [LoopEnd]"r"(LoopEnd)183: "memory", "hi", "lo"184);185186return ((int64_t)temp0 << 32 | temp1);187}188189#define HUFFMAN_COST_PASS \190__asm__ volatile( \191"sll %[temp1], %[temp0], 3 \n\t" \192"addiu %[temp3], %[streak], -3 \n\t" \193"addu %[temp2], %[pstreaks], %[temp1] \n\t" \194"blez %[temp3], 1f \n\t" \195"srl %[temp1], %[temp1], 1 \n\t" \196"addu %[temp3], %[pcnts], %[temp1] \n\t" \197"lw %[temp0], 4(%[temp2]) \n\t" \198"lw %[temp1], 0(%[temp3]) \n\t" \199"addu %[temp0], %[temp0], %[streak] \n\t" \200"addiu %[temp1], %[temp1], 1 \n\t" \201"sw %[temp0], 4(%[temp2]) \n\t" \202"sw %[temp1], 0(%[temp3]) \n\t" \203"b 2f \n\t" \204"1: \n\t" \205"lw %[temp0], 0(%[temp2]) \n\t" \206"addu %[temp0], %[temp0], %[streak] \n\t" \207"sw %[temp0], 0(%[temp2]) \n\t" \208"2: \n\t" \209: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \210[temp3]"=&r"(temp3), [temp0]"+r"(temp0) \211: [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \212[streak]"r"(streak) \213: "memory" \214);215216// Returns the various RLE counts217static WEBP_INLINE void GetEntropyUnrefinedHelper(218uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,219int* WEBP_RESTRICT const i_prev,220VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,221VP8LStreaks* WEBP_RESTRICT const stats) {222int* const pstreaks = &stats->streaks[0][0];223int* const pcnts = &stats->counts[0];224int temp0, temp1, temp2, temp3;225const int streak = i - *i_prev;226227// Gather info for the bit entropy.228if (*val_prev != 0) {229bit_entropy->sum += (*val_prev) * streak;230bit_entropy->nonzeros += streak;231bit_entropy->nonzero_code = *i_prev;232bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;233if (bit_entropy->max_val < *val_prev) {234bit_entropy->max_val = *val_prev;235}236}237238// Gather info for the Huffman cost.239temp0 = (*val_prev != 0);240HUFFMAN_COST_PASS241242*val_prev = val;243*i_prev = i;244}245246static void GetEntropyUnrefined_MIPS32(247const uint32_t X[], int length,248VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,249VP8LStreaks* WEBP_RESTRICT const stats) {250int i;251int i_prev = 0;252uint32_t x_prev = X[0];253254memset(stats, 0, sizeof(*stats));255VP8LBitEntropyInit(bit_entropy);256257for (i = 1; i < length; ++i) {258const uint32_t x = X[i];259if (x != x_prev) {260GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);261}262}263GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);264265bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;266}267268static void GetCombinedEntropyUnrefined_MIPS32(269const uint32_t X[], const uint32_t Y[], int length,270VP8LBitEntropy* WEBP_RESTRICT const entropy,271VP8LStreaks* WEBP_RESTRICT const stats) {272int i = 1;273int i_prev = 0;274uint32_t xy_prev = X[0] + Y[0];275276memset(stats, 0, sizeof(*stats));277VP8LBitEntropyInit(entropy);278279for (i = 1; i < length; ++i) {280const uint32_t xy = X[i] + Y[i];281if (xy != xy_prev) {282GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);283}284}285GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);286287entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;288}289290#define ASM_START \291__asm__ volatile( \292".set push \n\t" \293".set at \n\t" \294".set macro \n\t" \295"1: \n\t"296297// P2 = P0 + P1298// A..D - offsets299// E - temp variable to tell macro300// if pointer should be incremented301// literal_ and successive histograms could be unaligned302// so we must use ulw and usw303#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \304"ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \305"ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \306"ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \307"ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \308"ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \309"ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \310"ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \311"ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \312"addu %[temp4], %[temp4], %[temp0] \n\t" \313"addu %[temp5], %[temp5], %[temp1] \n\t" \314"addu %[temp6], %[temp6], %[temp2] \n\t" \315"addu %[temp7], %[temp7], %[temp3] \n\t" \316"addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \317".if " #E " == 1 \n\t" \318"addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \319".endif \n\t" \320"usw %[temp4], " #A "(%[" #P2 "]) \n\t" \321"usw %[temp5], " #B "(%[" #P2 "]) \n\t" \322"usw %[temp6], " #C "(%[" #P2 "]) \n\t" \323"usw %[temp7], " #D "(%[" #P2 "]) \n\t" \324"addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \325"bne %[" #P0 "], %[LoopEnd], 1b \n\t" \326".set pop \n\t" \327328#define ASM_END_COMMON_0 \329: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \330[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \331[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \332[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \333[pa]"+r"(pa), [pout]"+r"(pout)334335#define ASM_END_COMMON_1 \336: [LoopEnd]"r"(LoopEnd) \337: "memory", "at" \338);339340#define ASM_END_0 \341ASM_END_COMMON_0 \342, [pb]"+r"(pb) \343ASM_END_COMMON_1344345#define ASM_END_1 \346ASM_END_COMMON_0 \347ASM_END_COMMON_1348349static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa,350const uint32_t* WEBP_RESTRICT pb,351uint32_t* WEBP_RESTRICT pout, int size) {352uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;353const int end = ((size) / 4) * 4;354const uint32_t* const LoopEnd = pa + end;355int i;356ASM_START357ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)358ASM_END_0359for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];360}361362static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa,363uint32_t* WEBP_RESTRICT pout, int size) {364uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;365const int end = ((size) / 4) * 4;366const uint32_t* const LoopEnd = pa + end;367int i;368ASM_START369ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)370ASM_END_1371for (i = 0; i < size - end; ++i) pout[i] += pa[i];372}373374#undef ASM_END_1375#undef ASM_END_0376#undef ASM_END_COMMON_1377#undef ASM_END_COMMON_0378#undef ADD_TO_OUT379#undef ASM_START380381//------------------------------------------------------------------------------382// Entry point383384extern void VP8LEncDspInitMIPS32(void);385386WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {387VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;388VP8LFastLog2Slow = FastLog2Slow_MIPS32;389VP8LExtraCost = ExtraCost_MIPS32;390VP8LExtraCostCombined = ExtraCostCombined_MIPS32;391VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;392VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;393VP8LAddVector = AddVector_MIPS32;394VP8LAddVectorEq = AddVectorEq_MIPS32;395}396397#else // !WEBP_USE_MIPS32398399WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)400401#endif // WEBP_USE_MIPS32402403404