Path: blob/21.2-virgl/src/util/fast_idiv_by_const.c
4545 views
/*1* Copyright © 2018 Advanced Micro Devices, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/2223/* Imported from:24* https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c25* Paper:26* http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf27*28* The author, ridiculous_fish, wrote:29*30* ''Reference implementations of computing and using the "magic number"31* approach to dividing by constants, including codegen instructions.32* The unsigned division incorporates the "round down" optimization per33* ridiculous_fish.34*35* This is free and unencumbered software. Any copyright is dedicated36* to the Public Domain.''37*/3839#include "fast_idiv_by_const.h"40#include "u_math.h"41#include <limits.h>42#include <assert.h>4344struct util_fast_udiv_info45util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS)46{47/* The numerator must fit in a uint64_t */48assert(num_bits > 0 && num_bits <= UINT_BITS);49assert(D != 0);5051/* The eventual result */52struct util_fast_udiv_info result;5354if (util_is_power_of_two_or_zero64(D)) {55unsigned div_shift = util_logbase2_64(D);5657if (div_shift) {58/* Dividing by a power of two. */59result.multiplier = 1ull << (UINT_BITS - div_shift);60result.pre_shift = 0;61result.post_shift = 0;62result.increment = 0;63return result;64} else {65/* Dividing by 1. */66/* Assuming: floor((num + 1) * (2^32 - 1) / 2^32) = num */67result.multiplier = UINT_BITS == 64 ? UINT64_MAX :68(1ull << UINT_BITS) - 1;69result.pre_shift = 0;70result.post_shift = 0;71result.increment = 1;72return result;73}74}7576/* The extra shift implicit in the difference between UINT_BITS and num_bits77*/78const unsigned extra_shift = UINT_BITS - num_bits;7980/* The initial power of 2 is one less than the first one that can possibly81* work.82*/83const uint64_t initial_power_of_2 = (uint64_t)1 << (UINT_BITS-1);8485/* The remainder and quotient of our power of 2 divided by d */86uint64_t quotient = initial_power_of_2 / D;87uint64_t remainder = initial_power_of_2 % D;8889/* ceil(log_2 D) */90unsigned ceil_log_2_D;9192/* The magic info for the variant "round down" algorithm */93uint64_t down_multiplier = 0;94unsigned down_exponent = 0;95int has_magic_down = 0;9697/* Compute ceil(log_2 D) */98ceil_log_2_D = 0;99uint64_t tmp;100for (tmp = D; tmp > 0; tmp >>= 1)101ceil_log_2_D += 1;102103104/* Begin a loop that increments the exponent, until we find a power of 2105* that works.106*/107unsigned exponent;108for (exponent = 0; ; exponent++) {109/* Quotient and remainder is from previous exponent; compute it for this110* exponent.111*/112if (remainder >= D - remainder) {113/* Doubling remainder will wrap around D */114quotient = quotient * 2 + 1;115remainder = remainder * 2 - D;116} else {117/* Remainder will not wrap */118quotient = quotient * 2;119remainder = remainder * 2;120}121122/* We're done if this exponent works for the round_up algorithm.123* Note that exponent may be larger than the maximum shift supported,124* so the check for >= ceil_log_2_D is critical.125*/126if ((exponent + extra_shift >= ceil_log_2_D) ||127(D - remainder) <= ((uint64_t)1 << (exponent + extra_shift)))128break;129130/* Set magic_down if we have not set it yet and this exponent works for131* the round_down algorithm132*/133if (!has_magic_down &&134remainder <= ((uint64_t)1 << (exponent + extra_shift))) {135has_magic_down = 1;136down_multiplier = quotient;137down_exponent = exponent;138}139}140141if (exponent < ceil_log_2_D) {142/* magic_up is efficient */143result.multiplier = quotient + 1;144result.pre_shift = 0;145result.post_shift = exponent;146result.increment = 0;147} else if (D & 1) {148/* Odd divisor, so use magic_down, which must have been set */149assert(has_magic_down);150result.multiplier = down_multiplier;151result.pre_shift = 0;152result.post_shift = down_exponent;153result.increment = 1;154} else {155/* Even divisor, so use a prefix-shifted dividend */156unsigned pre_shift = 0;157uint64_t shifted_D = D;158while ((shifted_D & 1) == 0) {159shifted_D >>= 1;160pre_shift += 1;161}162result = util_compute_fast_udiv_info(shifted_D, num_bits - pre_shift,163UINT_BITS);164/* expect no increment or pre_shift in this path */165assert(result.increment == 0 && result.pre_shift == 0);166result.pre_shift = pre_shift;167}168return result;169}170171static inline int64_t172sign_extend(int64_t x, unsigned SINT_BITS)173{174return (int64_t)((uint64_t)x << (64 - SINT_BITS)) >> (64 - SINT_BITS);175}176177struct util_fast_sdiv_info178util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS)179{180/* D must not be zero. */181assert(D != 0);182/* The result is not correct for these divisors. */183assert(D != 1 && D != -1);184185/* Our result */186struct util_fast_sdiv_info result;187188/* Absolute value of D (we know D is not the most negative value since189* that's a power of 2)190*/191const uint64_t abs_d = (D < 0 ? -D : D);192193/* The initial power of 2 is one less than the first one that can possibly194* work */195/* "two31" in Warren */196unsigned exponent = SINT_BITS - 1;197const uint64_t initial_power_of_2 = (uint64_t)1 << exponent;198199/* Compute the absolute value of our "test numerator,"200* which is the largest dividend whose remainder with d is d-1.201* This is called anc in Warren.202*/203const uint64_t tmp = initial_power_of_2 + (D < 0);204const uint64_t abs_test_numer = tmp - 1 - tmp % abs_d;205206/* Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) */207uint64_t quotient1 = initial_power_of_2 / abs_test_numer;208uint64_t remainder1 = initial_power_of_2 % abs_test_numer;209uint64_t quotient2 = initial_power_of_2 / abs_d;210uint64_t remainder2 = initial_power_of_2 % abs_d;211uint64_t delta;212213/* Begin our loop */214do {215/* Update the exponent */216exponent++;217218/* Update quotient1 and remainder1 */219quotient1 *= 2;220remainder1 *= 2;221if (remainder1 >= abs_test_numer) {222quotient1 += 1;223remainder1 -= abs_test_numer;224}225226/* Update quotient2 and remainder2 */227quotient2 *= 2;228remainder2 *= 2;229if (remainder2 >= abs_d) {230quotient2 += 1;231remainder2 -= abs_d;232}233234/* Keep going as long as (2**exponent) / abs_d <= delta */235delta = abs_d - remainder2;236} while (quotient1 < delta || (quotient1 == delta && remainder1 == 0));237238result.multiplier = sign_extend(quotient2 + 1, SINT_BITS);239if (D < 0) result.multiplier = -result.multiplier;240result.shift = exponent - SINT_BITS;241return result;242}243244245