Path: blob/master/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
16348 views
// Copyright 2015 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// MIPS version of lossless functions10//11// Author(s): Djordje Pesut ([email protected])12// Jovan Zelincevic ([email protected])1314#include "src/dsp/dsp.h"15#include "src/dsp/lossless.h"16#include "src/dsp/lossless_common.h"1718#if defined(WEBP_USE_MIPS32)1920#include <assert.h>21#include <math.h>22#include <stdlib.h>23#include <string.h>2425static float FastSLog2Slow_MIPS32(uint32_t v) {26assert(v >= LOG_LOOKUP_IDX_MAX);27if (v < APPROX_LOG_WITH_CORRECTION_MAX) {28uint32_t log_cnt, y, correction;29const int c24 = 24;30const float v_f = (float)v;31uint32_t temp;3233// Xf = 256 = 2^834// log_cnt is index of leading one in upper 24 bits35__asm__ volatile(36"clz %[log_cnt], %[v] \n\t"37"addiu %[y], $zero, 1 \n\t"38"subu %[log_cnt], %[c24], %[log_cnt] \n\t"39"sllv %[y], %[y], %[log_cnt] \n\t"40"srlv %[temp], %[v], %[log_cnt] \n\t"41: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),42[temp]"=r"(temp)43: [c24]"r"(c24), [v]"r"(v)44);4546// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 25647// Xf = floor(Xf) * (1 + (v % y) / v)48// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)49// The correction factor: log(1 + d) ~ d; for very small d values, so50// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v51// LOG_2_RECIPROCAL ~ 23/165253// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)54correction = (23 * (v & (y - 1))) >> 4;55return v_f * (kLog2Table[temp] + log_cnt) + correction;56} else {57return (float)(LOG_2_RECIPROCAL * v * log((double)v));58}59}6061static float FastLog2Slow_MIPS32(uint32_t v) {62assert(v >= LOG_LOOKUP_IDX_MAX);63if (v < APPROX_LOG_WITH_CORRECTION_MAX) {64uint32_t log_cnt, y;65const int c24 = 24;66double log_2;67uint32_t temp;6869__asm__ volatile(70"clz %[log_cnt], %[v] \n\t"71"addiu %[y], $zero, 1 \n\t"72"subu %[log_cnt], %[c24], %[log_cnt] \n\t"73"sllv %[y], %[y], %[log_cnt] \n\t"74"srlv %[temp], %[v], %[log_cnt] \n\t"75: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),76[temp]"=r"(temp)77: [c24]"r"(c24), [v]"r"(v)78);7980log_2 = kLog2Table[temp] + log_cnt;81if (v >= APPROX_LOG_MAX) {82// Since the division is still expensive, add this correction factor only83// for large values of 'v'.8485const uint32_t correction = (23 * (v & (y - 1))) >> 4;86log_2 += (double)correction / v;87}88return (float)log_2;89} else {90return (float)(LOG_2_RECIPROCAL * log((double)v));91}92}9394// C version of this function:95// int i = 0;96// int64_t cost = 0;97// const uint32_t* pop = &population[4];98// const uint32_t* LoopEnd = &population[length];99// while (pop != LoopEnd) {100// ++i;101// cost += i * *pop;102// cost += i * *(pop + 1);103// pop += 2;104// }105// return (double)cost;106static double ExtraCost_MIPS32(const uint32_t* const population, int length) {107int i, temp0, temp1;108const uint32_t* pop = &population[4];109const uint32_t* const LoopEnd = &population[length];110111__asm__ volatile(112"mult $zero, $zero \n\t"113"xor %[i], %[i], %[i] \n\t"114"beq %[pop], %[LoopEnd], 2f \n\t"115"1: \n\t"116"lw %[temp0], 0(%[pop]) \n\t"117"lw %[temp1], 4(%[pop]) \n\t"118"addiu %[i], %[i], 1 \n\t"119"addiu %[pop], %[pop], 8 \n\t"120"madd %[i], %[temp0] \n\t"121"madd %[i], %[temp1] \n\t"122"bne %[pop], %[LoopEnd], 1b \n\t"123"2: \n\t"124"mfhi %[temp0] \n\t"125"mflo %[temp1] \n\t"126: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),127[i]"=&r"(i), [pop]"+r"(pop)128: [LoopEnd]"r"(LoopEnd)129: "memory", "hi", "lo"130);131132return (double)((int64_t)temp0 << 32 | temp1);133}134135// C version of this function:136// int i = 0;137// int64_t cost = 0;138// const uint32_t* pX = &X[4];139// const uint32_t* pY = &Y[4];140// const uint32_t* LoopEnd = &X[length];141// while (pX != LoopEnd) {142// const uint32_t xy0 = *pX + *pY;143// const uint32_t xy1 = *(pX + 1) + *(pY + 1);144// ++i;145// cost += i * xy0;146// cost += i * xy1;147// pX += 2;148// pY += 2;149// }150// return (double)cost;151static double ExtraCostCombined_MIPS32(const uint32_t* const X,152const uint32_t* const Y, int length) {153int i, temp0, temp1, temp2, temp3;154const uint32_t* pX = &X[4];155const uint32_t* pY = &Y[4];156const uint32_t* const LoopEnd = &X[length];157158__asm__ volatile(159"mult $zero, $zero \n\t"160"xor %[i], %[i], %[i] \n\t"161"beq %[pX], %[LoopEnd], 2f \n\t"162"1: \n\t"163"lw %[temp0], 0(%[pX]) \n\t"164"lw %[temp1], 0(%[pY]) \n\t"165"lw %[temp2], 4(%[pX]) \n\t"166"lw %[temp3], 4(%[pY]) \n\t"167"addiu %[i], %[i], 1 \n\t"168"addu %[temp0], %[temp0], %[temp1] \n\t"169"addu %[temp2], %[temp2], %[temp3] \n\t"170"addiu %[pX], %[pX], 8 \n\t"171"addiu %[pY], %[pY], 8 \n\t"172"madd %[i], %[temp0] \n\t"173"madd %[i], %[temp2] \n\t"174"bne %[pX], %[LoopEnd], 1b \n\t"175"2: \n\t"176"mfhi %[temp0] \n\t"177"mflo %[temp1] \n\t"178: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),179[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),180[i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)181: [LoopEnd]"r"(LoopEnd)182: "memory", "hi", "lo"183);184185return (double)((int64_t)temp0 << 32 | temp1);186}187188#define HUFFMAN_COST_PASS \189__asm__ volatile( \190"sll %[temp1], %[temp0], 3 \n\t" \191"addiu %[temp3], %[streak], -3 \n\t" \192"addu %[temp2], %[pstreaks], %[temp1] \n\t" \193"blez %[temp3], 1f \n\t" \194"srl %[temp1], %[temp1], 1 \n\t" \195"addu %[temp3], %[pcnts], %[temp1] \n\t" \196"lw %[temp0], 4(%[temp2]) \n\t" \197"lw %[temp1], 0(%[temp3]) \n\t" \198"addu %[temp0], %[temp0], %[streak] \n\t" \199"addiu %[temp1], %[temp1], 1 \n\t" \200"sw %[temp0], 4(%[temp2]) \n\t" \201"sw %[temp1], 0(%[temp3]) \n\t" \202"b 2f \n\t" \203"1: \n\t" \204"lw %[temp0], 0(%[temp2]) \n\t" \205"addu %[temp0], %[temp0], %[streak] \n\t" \206"sw %[temp0], 0(%[temp2]) \n\t" \207"2: \n\t" \208: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \209[temp3]"=&r"(temp3), [temp0]"+r"(temp0) \210: [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \211[streak]"r"(streak) \212: "memory" \213);214215// Returns the various RLE counts216static WEBP_INLINE void GetEntropyUnrefinedHelper(217uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,218VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {219int* const pstreaks = &stats->streaks[0][0];220int* const pcnts = &stats->counts[0];221int temp0, temp1, temp2, temp3;222const int streak = i - *i_prev;223224// Gather info for the bit entropy.225if (*val_prev != 0) {226bit_entropy->sum += (*val_prev) * streak;227bit_entropy->nonzeros += streak;228bit_entropy->nonzero_code = *i_prev;229bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;230if (bit_entropy->max_val < *val_prev) {231bit_entropy->max_val = *val_prev;232}233}234235// Gather info for the Huffman cost.236temp0 = (*val_prev != 0);237HUFFMAN_COST_PASS238239*val_prev = val;240*i_prev = i;241}242243static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,244VP8LBitEntropy* const bit_entropy,245VP8LStreaks* const stats) {246int i;247int i_prev = 0;248uint32_t x_prev = X[0];249250memset(stats, 0, sizeof(*stats));251VP8LBitEntropyInit(bit_entropy);252253for (i = 1; i < length; ++i) {254const uint32_t x = X[i];255if (x != x_prev) {256GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);257}258}259GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);260261bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);262}263264static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],265const uint32_t Y[],266int length,267VP8LBitEntropy* const entropy,268VP8LStreaks* const stats) {269int i = 1;270int i_prev = 0;271uint32_t xy_prev = X[0] + Y[0];272273memset(stats, 0, sizeof(*stats));274VP8LBitEntropyInit(entropy);275276for (i = 1; i < length; ++i) {277const uint32_t xy = X[i] + Y[i];278if (xy != xy_prev) {279GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);280}281}282GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);283284entropy->entropy += VP8LFastSLog2(entropy->sum);285}286287#define ASM_START \288__asm__ volatile( \289".set push \n\t" \290".set at \n\t" \291".set macro \n\t" \292"1: \n\t"293294// P2 = P0 + P1295// A..D - offsets296// E - temp variable to tell macro297// if pointer should be incremented298// literal_ and successive histograms could be unaligned299// so we must use ulw and usw300#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \301"ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \302"ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \303"ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \304"ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \305"ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \306"ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \307"ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \308"ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \309"addu %[temp4], %[temp4], %[temp0] \n\t" \310"addu %[temp5], %[temp5], %[temp1] \n\t" \311"addu %[temp6], %[temp6], %[temp2] \n\t" \312"addu %[temp7], %[temp7], %[temp3] \n\t" \313"addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \314".if " #E " == 1 \n\t" \315"addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \316".endif \n\t" \317"usw %[temp4], " #A "(%[" #P2 "]) \n\t" \318"usw %[temp5], " #B "(%[" #P2 "]) \n\t" \319"usw %[temp6], " #C "(%[" #P2 "]) \n\t" \320"usw %[temp7], " #D "(%[" #P2 "]) \n\t" \321"addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \322"bne %[" #P0 "], %[LoopEnd], 1b \n\t" \323".set pop \n\t" \324325#define ASM_END_COMMON_0 \326: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \327[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \328[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \329[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \330[pa]"+r"(pa), [pout]"+r"(pout)331332#define ASM_END_COMMON_1 \333: [LoopEnd]"r"(LoopEnd) \334: "memory", "at" \335);336337#define ASM_END_0 \338ASM_END_COMMON_0 \339, [pb]"+r"(pb) \340ASM_END_COMMON_1341342#define ASM_END_1 \343ASM_END_COMMON_0 \344ASM_END_COMMON_1345346#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \347const uint32_t* pa = (const uint32_t*)(A); \348const uint32_t* pb = (const uint32_t*)(B); \349uint32_t* pout = (uint32_t*)(OUT); \350const uint32_t* const LoopEnd = pa + (SIZE); \351assert((SIZE) % 4 == 0); \352ASM_START \353ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \354ASM_END_0 \355if ((EXTRA_SIZE) > 0) { \356const int last = (EXTRA_SIZE); \357int i; \358for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \359} \360} while (0)361362#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \363const uint32_t* pa = (const uint32_t*)(A); \364uint32_t* pout = (uint32_t*)(OUT); \365const uint32_t* const LoopEnd = pa + (SIZE); \366assert((SIZE) % 4 == 0); \367ASM_START \368ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \369ASM_END_1 \370if ((EXTRA_SIZE) > 0) { \371const int last = (EXTRA_SIZE); \372int i; \373for (i = 0; i < last; ++i) pout[i] += pa[i]; \374} \375} while (0)376377static void HistogramAdd_MIPS32(const VP8LHistogram* const a,378const VP8LHistogram* const b,379VP8LHistogram* const out) {380uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;381const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)382- (NUM_LITERAL_CODES + NUM_LENGTH_CODES);383assert(a->palette_code_bits_ == b->palette_code_bits_);384385if (b != out) {386ADD_VECTOR(a->literal_, b->literal_, out->literal_,387NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);388ADD_VECTOR(a->distance_, b->distance_, out->distance_,389NUM_DISTANCE_CODES, 0);390ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);391ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);392ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);393} else {394ADD_VECTOR_EQ(a->literal_, out->literal_,395NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);396ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);397ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);398ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);399ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);400}401}402403#undef ADD_VECTOR_EQ404#undef ADD_VECTOR405#undef ASM_END_1406#undef ASM_END_0407#undef ASM_END_COMMON_1408#undef ASM_END_COMMON_0409#undef ADD_TO_OUT410#undef ASM_START411412//------------------------------------------------------------------------------413// Entry point414415extern void VP8LEncDspInitMIPS32(void);416417WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {418VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;419VP8LFastLog2Slow = FastLog2Slow_MIPS32;420VP8LExtraCost = ExtraCost_MIPS32;421VP8LExtraCostCombined = ExtraCostCombined_MIPS32;422VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;423VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;424VP8LHistogramAdd = HistogramAdd_MIPS32;425}426427#else // !WEBP_USE_MIPS32428429WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)430431#endif // WEBP_USE_MIPS32432433434