Path: blob/21.2-virgl/src/util/fast_idiv_by_const.h
4545 views
/*
 * Copyright © 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef FAST_IDIV_BY_CONST_H
#define FAST_IDIV_BY_CONST_H

/* Imported from:
 *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
 */

#include <inttypes.h>
#include <limits.h>
#include <assert.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Computes "magic info" for performing signed division by a fixed integer D.
 * SINT_BITS is the bit width of the signed integer type in which the division
 * is performed; that type is assumed to be large enough to hold both the
 * dividend and the divisor.
 * Here >> is arithmetic (signed) shift, and >>> is logical shift.
 *
 * To emit code for n/D, rounding towards zero, use the following sequence:
 *
 *   m = util_compute_fast_sdiv_info(D, SINT_BITS)
 *   emit("result = (m.multiplier * n) >> SINT_BITS");
 *   if D > 0 and m.multiplier < 0: emit("result += n")
 *   if D < 0 and m.multiplier > 0: emit("result -= n")
 *   if m.shift > 0: emit("result >>= m.shift")
 *   emit("result += (result < 0)")
 *
 * The shifts by SINT_BITS may be "free" if the high half of the full multiply
 * is put in a separate register.
 *
 * The final add can of course be implemented via the sign bit, e.g.
 *    result += (result >>> (SINT_BITS - 1))
 * or
 *    result -= (result >> (SINT_BITS - 1))
 *
 * This code is heavily indebted to Hacker's Delight by Henry Warren.
 * See http://www.hackersdelight.org/HDcode/magic.c.txt
 * Used with permission from http://www.hackersdelight.org/permissions.htm
 */

struct util_fast_sdiv_info {
   int64_t multiplier; /* the "magic number" multiplier */
   unsigned shift; /* shift for the dividend after multiplying */
};

struct util_fast_sdiv_info
util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);

/* Computes "magic info" for performing unsigned division by a fixed positive
 * integer D.  UINT_BITS is the bit size at which the final "magic"
 * calculation will be performed; it is assumed to be large enough to hold
 * both the dividend and the divisor.  num_bits can be set appropriately if n
 * is known to be narrower than UINT_BITS; if this is not known then pass
 * UINT_BITS for num_bits.
 *
 * Assume we have a hardware register of width UINT_BITS, a known constant D
 * which is not zero and not a power of 2, and a variable n of width num_bits
 * (which may be up to UINT_BITS).  To emit code for n/D, use one of the two
 * following sequences (here >>> refers to a logical bitshift):
 *
 *   m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
 *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
 *   if m.increment: emit("n = saturated_increment(n)")
 *   emit("result = (m.multiplier * n) >>> UINT_BITS")
 *   if m.post_shift > 0: emit("result >>>= m.post_shift")
 *
 * or
 *
 *   m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
 *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
 *   emit("result = m.multiplier * n")
 *   if m.increment: emit("result = result + m.multiplier")
 *   emit("result >>>= UINT_BITS")
 *   if m.post_shift > 0: emit("result >>>= m.post_shift")
 *
 * This second version works even if D is 1.  The shifts by UINT_BITS may be
 * "free" if the high half of the full multiply is put in a separate register.
 *
 * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
 *   if n == (1 << UINT_BITS)-1: result = n
 *   else: result = n+1
 * A common way to implement this is with the carry bit.  For example, on x86:
 *   add 1
 *   sbb 0
 *
 * Some invariants:
 *   1: At least one of pre_shift and increment is zero
 *   2: multiplier is never zero
 *
 * This code incorporates the "round down" optimization per ridiculous_fish.
 */

struct util_fast_udiv_info {
   uint64_t multiplier; /* the "magic number" multiplier */
   unsigned pre_shift; /* shift for the dividend before multiplying */
   unsigned post_shift; /* shift for the dividend after multiplying */
   int increment; /* 0 or 1; if set then increment the numerator, using one of
                     the two strategies */
};

struct util_fast_udiv_info
util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);

/* Below are possible options for dividing by a uniform in a shader where
 * the divisor is constant but not known at compile time.
 */

/* Full version.
 *
 * Applies the magic-number sequence described above to a 32-bit dividend:
 * pre-shift, add the increment, multiply, keep the high 32 bits of the
 * product, post-shift.
 *
 * n:    the dividend.
 * info: magic info for the divisor, used with a 32-bit calculation width
 *       (the product is shifted right by 32).
 *
 * Returns the value produced by that sequence, truncated to 32 bits.
 */
static inline uint32_t
util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
{
   n = n >> info.pre_shift;
   /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
    * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
    *
    * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
    *    increment = increment ? multiplier : 0; // on the CPU
    *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
    */
   /* n is widened to 64 bits before the add, so n + increment cannot wrap
    * even when n == UINT32_MAX.
    */
   n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
   n = n >> info.post_shift;
   return n;
}

/* A little more efficient version if n != UINT_MAX, i.e. no unsigned
 * wraparound in the computation.
 *
 * Same sequence as util_fast_udiv32(), except the increment is added in
 * 32 bits before widening for the multiply.
 *
 * n:    the dividend; must not be UINT32_MAX (asserted).
 * info: magic info for the divisor, used with a 32-bit calculation width.
 *
 * Returns the value produced by that sequence, truncated to 32 bits.
 */
static inline uint32_t
util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
{
   assert(n != UINT32_MAX);
   n = n >> info.pre_shift;
   n = n + info.increment;
   n = ((uint64_t)n * info.multiplier) >> 32;
   n = n >> info.post_shift;
   return n;
}

/* Even faster version but both operands must be 31-bit unsigned integers
 * and the divisor must be greater than 1.
 *
 * info must be computed with num_bits == 31.
 *
 * Skips the pre-shift and the increment entirely; both are asserted to be
 * zero in info.
 *
 * n:    the dividend.
 * info: magic info for the divisor, with pre_shift == 0 and increment == 0.
 *
 * Returns the high 32 bits of n * multiplier, shifted right by post_shift.
 */
static inline uint32_t
util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
{
   assert(info.pre_shift == 0);
   assert(info.increment == 0);
   n = ((uint64_t)n * info.multiplier) >> 32;
   n = n >> info.post_shift;
   return n;
}

#ifdef __cplusplus
} /* extern C */
#endif

#endif /* FAST_IDIV_BY_CONST_H */