Path: arch/m68k/math-emu/multi_arith.h
/* multi_arith.h: multi-precision integer arithmetic functions, needed
   to do extended-precision floating point.

   (c) 1998 David Huggins-Daines.

   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
   David Mosberger-Tang.

   You may copy, modify, and redistribute this file under the terms of
   the GNU General Public License, version 2, or any later version, at
   your convenience. */

/* Note:

   These are not general multi-precision math routines.  Rather, they
   implement the subset of integer arithmetic that we need in order to
   multiply, divide, and normalize 128-bit unsigned mantissae. */

#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H

#if 0	/* old code... */

/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];

/* Word order */
enum {
	MSW128,
	NMSW128,
	NLSW128,
	LSW128
};

/* big-endian */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])

/* Convenience functions to stuff various integer values into int128s */

static inline void zero128(int128 a)
{
	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}

/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
			  unsigned int i0, int128 a)
{
	a[LSW128] = i0;
	a[NLSW128] = i1;
	a[NMSW128] = i2;
	a[MSW128] = i3;
}

/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
	dest[LSW128] = (unsigned int) src;
	dest[NLSW128] = src >> 32;
	dest[NMSW128] = dest[MSW128] = 0;
}

static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
	*dest = src[LSW128] | (long long) src[NLSW128] << 32;
}

static inline void put_i128(const int128 a)
{
	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
	       a[NLSW128], a[LSW128]);
}

/* Internal shifters:

   Note that these are only good for 0 < count < 32.  */

static inline void _lsl128(unsigned int count, int128 a)
{
	a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
	a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
	a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
	a[LSW128] <<= count;
}

static inline void _lsr128(unsigned int count, int128 a)
{
	a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
	a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
	a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
	a[MSW128] >>= count;
}

/* Should be faster, one would hope */

static inline void lslone128(int128 a)
{
	asm volatile ("lsl.l #1,%0\n"
		      "roxl.l #1,%1\n"
		      "roxl.l #1,%2\n"
		      "roxl.l #1,%3\n"
		      :
		      "=d" (a[LSW128]),
		      "=d" (a[NLSW128]),
		      "=d" (a[NMSW128]),
		      "=d" (a[MSW128])
		      :
		      "0" (a[LSW128]),
		      "1" (a[NLSW128]),
		      "2" (a[NMSW128]),
		      "3" (a[MSW128]));
}

static inline void lsrone128(int128 a)
{
	asm volatile ("lsr.l #1,%0\n"
		      "roxr.l #1,%1\n"
		      "roxr.l #1,%2\n"
		      "roxr.l #1,%3\n"
		      :
		      "=d" (a[MSW128]),
		      "=d" (a[NMSW128]),
		      "=d" (a[NLSW128]),
		      "=d" (a[LSW128])
		      :
		      "0" (a[MSW128]),
		      "1" (a[NMSW128]),
		      "2" (a[NLSW128]),
		      "3" (a[LSW128]));
}

/* Generalized 128-bit shifters:

   These bit-shift to a multiple of 32, then move whole longwords.  */
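
/* For example (illustrative): lsl128(40, a) first does _lsl128(8, a)
   for the 40 % 32 = 8 odd bits, then the longword loop below moves
   every word up by 40 / 32 = 1 place, for a total shift of 40 bits. */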

static inline void lsl128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsl128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	/* argh, gak, endian-sensitive */
	for (i = 0; i < 4 - wordcount; i++) {
		a[i] = a[i + wordcount];
	}
	for (i = 3; i >= 4 - wordcount; --i) {
		a[i] = 0;
	}
}

static inline void lsr128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsr128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	for (i = 3; i >= wordcount; --i) {
		a[i] = a[i - wordcount];
	}
	for (i = 0; i < wordcount; i++) {
		a[i] = 0;
	}
}

static inline int orl128(int a, int128 b)
{
	b[LSW128] |= a;
}

static inline int btsthi128(const int128 a)
{
	return a[MSW128] & 0x80000000;
}

/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
	int r = 0;

	if (top > 31)
		r |= a[LSW128];
	if (top > 63)
		r |= a[NLSW128];
	if (top > 95)
		r |= a[NMSW128];

	r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);

	return (r != 0);
}

/* Aargh.  We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
	*mask = 0;

	if (pos < 32) {
		LO_WORD(*mask) = (1 << pos) - 1;
		return;
	}
	LO_WORD(*mask) = -1;
	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}

static inline void bset64(int pos, unsigned long long *dest)
{
	/* This conditional will be optimized away.  Thanks, GCC! */
	if (pos < 32)
		asm volatile ("bset %1,%0" : "=m" (LO_WORD(*dest))
			      : "id" (pos));
	else
		asm volatile ("bset %1,%0" : "=m" (HI_WORD(*dest))
			      : "id" (pos - 32));
}

static inline int btst64(int pos, unsigned long long dest)
{
	if (pos < 32)
		return (0 != (LO_WORD(dest) & (1 << pos)));
	else
		return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}

static inline void lsl64(int count, unsigned long long *dest)
{
	if (count < 32) {
		HI_WORD(*dest) = (HI_WORD(*dest) << count)
				 | (LO_WORD(*dest) >> count);
		LO_WORD(*dest) <<= count;
		return;
	}
	count -= 32;
	HI_WORD(*dest) = LO_WORD(*dest) << count;
	LO_WORD(*dest) = 0;
}

static inline void lsr64(int count, unsigned long long *dest)
{
	if (count < 32) {
		LO_WORD(*dest) = (LO_WORD(*dest) >> count)
				 | (HI_WORD(*dest) << (32 - count));
		HI_WORD(*dest) >>= count;
		return;
	}
	count -= 32;
	LO_WORD(*dest) = HI_WORD(*dest) >> count;
	HI_WORD(*dest) = 0;
}
#endif

static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
	reg->exp += cnt;

	switch (cnt) {
	case 0 ... 8:
		reg->lowmant = reg->mant.m32[1] << (8 - cnt);
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 9 ... 32:
		reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 33 ... 39:
		asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
			: "m" (reg->mant.m32[0]), "d" (64 - cnt));
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	case 40 ... 71:
		reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
		if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	default:
		reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
		reg->mant.m32[0] = 0;
		reg->mant.m32[1] = 0;
		break;
	}
}

static inline int fp_overnormalize(struct fp_ext *reg)
{
	int shift;

	if (reg->mant.m32[0]) {
		asm ("bfffo %1{#0,#32},%0"
			: "=d" (shift) : "dm" (reg->mant.m32[0]));
		reg->mant.m32[0] = (reg->mant.m32[0] << shift) |
				   (reg->mant.m32[1] >> (32 - shift));
		reg->mant.m32[1] = (reg->mant.m32[1] << shift);
	} else {
		asm ("bfffo %1{#0,#32},%0"
			: "=d" (shift) : "dm" (reg->mant.m32[1]));
		reg->mant.m32[0] = (reg->mant.m32[1] << shift);
		reg->mant.m32[1] = 0;
		shift += 32;
	}

	return shift;
}

static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
{
	int carry;

	/* we assume here, gcc only insert move and a clr instr */
	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src->lowmant), "0,0" (dest->lowmant));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));

	return carry;
}

static inline int fp_addcarry(struct fp_ext *reg)
{
	if (++reg->exp == 0x7fff) {
		if (reg->mant.m64)
			fp_set_sr(FPSR_EXC_INEX2);
		reg->mant.m64 = 0;
		fp_set_sr(FPSR_EXC_OVFL);
		return 0;
	}
	reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
	reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
			   (reg->mant.m32[0] << 31);
	reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;

	return 1;
}

static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
			      struct fp_ext *src2)
{
	/* we assume here, gcc only insert move and a clr instr */
	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src2->lowmant), "0,0" (src1->lowmant));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
}

#define fp_mul64(desth, destl, src1, src2) ({				\
	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)		\
		: "dm" (src1), "0" (src2));				\
})
#define fp_div64(quot, rem, srch, srcl, div)				\
	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)		\
		: "dm" (div), "1" (srch), "0" (srcl))
#define fp_add64(dest1, dest2, src1, src2) ({				\
	asm ("add.l %1,%0" : "=d,dm" (dest2)				\
		: "dm,d" (src2), "0,0" (dest2));			\
	asm ("addx.l %1,%0" : "=d" (dest1)				\
		: "d" (src1), "0" (dest1));				\
})
#define fp_addx96(dest, src) ({						\
	/* we assume here, gcc only insert move and a clr instr */	\
	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2])		\
		: "g,d" (temp.m32[1]), "0,0" (dest->m32[2]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1])		\
		: "d" (temp.m32[0]), "0" (dest->m32[1]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0])		\
		: "d" (0), "0" (dest->m32[0]));				\
})
#define fp_sub64(dest, src) ({						\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])			\
		: "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));		\
	asm ("subx.l %1,%0" : "=d" (dest.m32[0])			\
		: "d" (src.m32[0]), "0" (dest.m32[0]));			\
})
#define fp_sub96c(dest, srch, srcm, srcl) ({				\
	char carry;							\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])			\
		: "dm,d" (srcl), "0,0" (dest.m32[2]));			\
	asm ("subx.l %1,%0" : "=d" (dest.m32[1])			\
		: "d" (srcm), "0" (dest.m32[1]));			\
	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])	\
		: "d" (srch), "1" (dest.m32[0]));			\
	carry;								\
})
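
/* Illustrative only, not part of the emulator: a rough portable-C model
   of what the two asm primitives above compute, for readers without the
   m68k instruction set in their head.  fp_mul64() is a 32x32->64 multiply
   split into high/low longwords, fp_div64() a 64/32->32 divide with
   remainder (mulu.l/divu.l do each in one instruction).  The *_model
   names are local to this sketch and exist nowhere else. */
#if 0
static inline void fp_mul64_model(unsigned int *desth, unsigned int *destl,
				  unsigned int src1, unsigned int src2)
{
	unsigned long long prod = (unsigned long long) src1 * src2;

	*desth = (unsigned int) (prod >> 32);
	*destl = (unsigned int) prod;
}

static inline void fp_div64_model(unsigned int *quot, unsigned int *rem,
				  unsigned int srch, unsigned int srcl,
				  unsigned int div)
{
	unsigned long long num = ((unsigned long long) srch << 32) | srcl;

	/* like divu.l, this only makes sense if the quotient fits in 32 bits */
	*quot = (unsigned int) (num / div);
	*rem = (unsigned int) (num % div);
}
#endif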

static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
				   struct fp_ext *src2)
{
	union fp_mant64 temp;

	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
	fp_addx96(dest, temp);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
	fp_addx96(dest, temp);
}
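
/* The multiply above is schoolbook arithmetic on 32-bit digits: with
   src1 = A:B and src2 = C:D (high:low longwords), the 128-bit product is
   A*C << 64  +  (A*D + B*C) << 32  +  B*D, and fp_addx96() folds the two
   middle partial products into the upper 96 bits of dest.  A portable
   sketch of the same computation (illustrative only; a[0]/b[0] are the
   high longwords, dest[0] the most significant result longword): */
#if 0
static inline void fp_multiplymant_model(unsigned int dest[4],
					 const unsigned int a[2],
					 const unsigned int b[2])
{
	unsigned long long lo = (unsigned long long) a[1] * b[1];
	unsigned long long hi = (unsigned long long) a[0] * b[0];
	unsigned long long mid1 = (unsigned long long) a[0] * b[1];
	unsigned long long mid2 = (unsigned long long) a[1] * b[0];
	unsigned long long sum;

	dest[3] = (unsigned int) lo;
	sum = (lo >> 32) + (unsigned int) mid1 + (unsigned int) mid2;
	dest[2] = (unsigned int) sum;
	sum = (sum >> 32) + (unsigned int) hi + (mid1 >> 32) + (mid2 >> 32);
	dest[1] = (unsigned int) sum;
	dest[0] = (unsigned int) (hi >> 32) + (unsigned int) (sum >> 32);
}
#endif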

static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
				 struct fp_ext *div)
{
	union fp_mant128 tmp;
	union fp_mant64 tmp64;
	unsigned long *mantp = dest->m32;
	unsigned long fix, rem, first, dummy;
	int i;

	/* the algorithm below requires dest to be smaller than div,
	   but both have the high bit set */
	if (src->mant.m64 >= div->mant.m64) {
		fp_sub64(src->mant, div->mant);
		*mantp = 1;
	} else
		*mantp = 0;
	mantp++;

	/* basic idea behind this algorithm: we can't divide two 64bit numbers
	   (AB/CD) directly, but we can calculate AB/C0, but this means this
	   quotient is off by C0/CD, so we have to multiply the first result
	   to fix the result, after that we have nearly the correct result
	   and only a few corrections are needed. */

	/* C0/CD can be precalculated, but it's an 64bit division again, but
	   we can make it a bit easier, by dividing first through C so we get
	   10/1D and now only a single shift and the value fits into 32bit. */
	fix = 0x80000000;
	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
	dummy = (dummy >> 1) | fix;
	fp_div64(fix, dummy, fix, 0, dummy);
	fix--;

	for (i = 0; i < 3; i++, mantp++) {
		if (src->mant.m32[0] == div->mant.m32[0]) {
			fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
			*mantp += fix;
		} else {
			fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
		}

		fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
		fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
		tmp.m32[2] = 0;

		fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
		fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);

		src->mant.m32[0] = tmp.m32[1];
		src->mant.m32[1] = tmp.m32[2];

		while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
			src->mant.m32[0] = tmp.m32[1];
			src->mant.m32[1] = tmp.m32[2];
			*mantp += 1;
		}
	}
}
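
/* Illustrative cross-check only, not the algorithm above: with 64-bit
   arithmetic freely available, the same quotient digits (one integer bit
   plus three 32-bit fraction digits of src/div) could be produced by
   plain restoring long division.  The fp_dividemant_model name and its
   bitwise inner loop are local to this sketch. */
#if 0
static inline void fp_dividemant_model(unsigned int quot[4],
				       unsigned long long src,
				       unsigned long long div)
{
	unsigned long long rem = src, digit;
	int i, bit;

	/* both mantissae are normalized, so the integer bit is 0 or 1 */
	quot[0] = (rem >= div);
	if (quot[0])
		rem -= div;

	for (i = 1; i < 4; i++) {
		/* rem < div here; compute floor(rem * 2^32 / div) bit by
		   bit so no intermediate ever needs more than 64 bits */
		digit = 0;
		for (bit = 0; bit < 32; bit++) {
			digit <<= 1;
			if (rem >= div - rem) {		/* i.e. 2*rem >= div */
				rem -= div - rem;	/* rem = 2*rem - div */
				digit |= 1;
			} else
				rem += rem;
		}
		quot[i] = (unsigned int) digit;
	}
}
#endif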

#if 0
static inline unsigned int fp_fls128(union fp_mant128 *src)
{
	unsigned long data;
	unsigned int res, off;

	if ((data = src->m32[0]))
		off = 0;
	else if ((data = src->m32[1]))
		off = 32;
	else if ((data = src->m32[2]))
		off = 64;
	else if ((data = src->m32[3]))
		off = 96;
	else
		return 128;

	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
	return res + off;
}

static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
	unsigned long sticky;

	switch (shift) {
	case 0:
		return;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (src->m32[3]) : "0" (src->m32[3]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[2]) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[0]) : "0" (src->m32[0]));
		return;
	case 2 ... 31:
		src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
		src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[3] = (src->m32[3] << shift);
		return;
	case 32 ... 63:
		shift -= 32;
		src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[2] = (src->m32[3] << shift);
		src->m32[3] = 0;
		return;
	case 64 ... 95:
		shift -= 64;
		src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[1] = (src->m32[3] << shift);
		src->m32[2] = src->m32[3] = 0;
		return;
	case 96 ... 127:
		shift -= 96;
		src->m32[0] = (src->m32[3] << shift);
		src->m32[1] = src->m32[2] = src->m32[3] = 0;
		return;
	case -31 ... -1:
		shift = -shift;
		sticky = 0;
		if (src->m32[3] << (32 - shift))
			sticky = 1;
		src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
		src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[0] = (src->m32[0] >> shift);
		return;
	case -63 ... -32:
		shift = -shift - 32;
		sticky = 0;
		if ((src->m32[2] << (32 - shift)) || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[1] = (src->m32[0] >> shift);
		src->m32[0] = 0;
		return;
	case -95 ... -64:
		shift = -shift - 64;
		sticky = 0;
		if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[0] >> shift);
		src->m32[1] = src->m32[0] = 0;
		return;
	case -127 ... -96:
		shift = -shift - 96;
		sticky = 0;
		if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[0] >> shift) | sticky;
		src->m32[2] = src->m32[1] = src->m32[0] = 0;
		return;
	}

	if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
		src->m32[3] = 1;
	else
		src->m32[3] = 0;
	src->m32[2] = 0;
	src->m32[1] = 0;
	src->m32[0] = 0;
}
#endif

static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
				 int shift)
{
	unsigned long tmp;

	switch (shift) {
	case 0:
		dest->mant.m64 = src->m64[0];
		dest->lowmant = src->m32[2] >> 24;
		if (src->m32[3] || (src->m32[2] << 8))
			dest->lowmant |= 1;
		break;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] || (tmp << 8))
			dest->lowmant |= 1;
		break;
	case 31:
		asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
			: "=d" (dest->mant.m32[0])
			: "d" (src->m32[0]), "0" (src->m32[1]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[3]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] << 7)
			dest->lowmant |= 1;
		break;
	case 32:
		dest->mant.m32[0] = src->m32[1];
		dest->mant.m32[1] = src->m32[2];
		dest->lowmant = src->m32[3] >> 24;
		if (src->m32[3] << 8)
			dest->lowmant |= 1;
		break;
	}
}

#if 0 /* old code... */
static inline int fls(unsigned int a)
{
	int r;

	asm volatile ("bfffo %1{#0,#32},%0"
		      : "=d" (r) : "md" (a));
	return r;
}

/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
	if (a[MSW128])
		return fls(a[MSW128]);
	if (a[NMSW128])
		return fls(a[NMSW128]) + 32;
	/* XXX: it probably never gets beyond this point in actual
	   use, but that's indicative of a more general problem in the
	   algorithm (i.e. as per the actual 68881 implementation, we
	   really only need at most 67 bits of precision [plus
	   overflow]) so I'm not going to fix it. */
	if (a[NLSW128])
		return fls(a[NLSW128]) + 64;
	if (a[LSW128])
		return fls(a[LSW128]) + 96;
	else
		return -1;
}

static inline int zerop128(const int128 a)
{
	return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

static inline int nonzerop128(const int128 a)
{
	return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
   here */
static inline void add128(const int128 a, int128 b)
{
	/* rotating carry flags */
	unsigned int carry[2];

	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
	b[LSW128] += a[LSW128];

	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];

	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];

	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}

/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
	/* rotating borrow flags */
	unsigned int borrow[2];

	borrow[0] = b[LSW128] < a[LSW128];
	b[LSW128] -= a[LSW128];

	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];

	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];

	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}

/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
	unsigned long long acc;
	int128 acc128;

	zero128(acc128);
	zero128(c);

	/* first the low words */
	if (LO_WORD(a) && LO_WORD(b)) {
		acc = (long long) LO_WORD(a) * LO_WORD(b);
		c[NLSW128] = HI_WORD(acc);
		c[LSW128] = LO_WORD(acc);
	}
	/* Next the high words */
	if (HI_WORD(a) && HI_WORD(b)) {
		acc = (long long) HI_WORD(a) * HI_WORD(b);
		c[MSW128] = HI_WORD(acc);
		c[NMSW128] = LO_WORD(acc);
	}
	/* The middle words */
	if (LO_WORD(a) && HI_WORD(b)) {
		acc = (long long) LO_WORD(a) * HI_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
	/* The first and last words */
	if (HI_WORD(a) && LO_WORD(b)) {
		acc = (long long) HI_WORD(a) * LO_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
}

/* Note: unsigned */
static inline int cmp128(int128 a, int128 b)
{
	if (a[MSW128] < b[MSW128])
		return -1;
	if (a[MSW128] > b[MSW128])
		return 1;
	if (a[NMSW128] < b[NMSW128])
		return -1;
	if (a[NMSW128] > b[NMSW128])
		return 1;
	if (a[NLSW128] < b[NLSW128])
		return -1;
	if (a[NLSW128] > b[NLSW128])
		return 1;

	return (signed) a[LSW128] - b[LSW128];
}

inline void div128(int128 a, int128 b, int128 c)
{
	int128 mask;

	/* Algorithm:

	   Shift the divisor until it's at least as big as the
	   dividend, keeping track of the position to which we've
	   shifted it, i.e. the power of 2 which we've multiplied it
	   by.

	   Then, for this power of 2 (the mask), and every one smaller
	   than it, subtract the raised divisor from the dividend and
	   add the mask to the quotient until the dividend is smaller
	   than the raised divisor.  At this point, divide the dividend
	   and the mask by 2 (i.e. shift one place to the right).
	   Lather, rinse, and repeat, until there are no more powers
	   of 2 left. */

	/* FIXME: needless to say, there's room for improvement here too. */
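
	/* Tiny worked example with small numbers, a = 13, b = 3:
	   shift up:    b: 3 -> 6 -> 12 -> 24,  mask: 1 -> 2 -> 4 -> 8
	   shift down:  b = 24: 13 < 24, skip;          mask -> 4, b -> 12
	                b = 12: 13 >= 12, a = 1, c = 4;  mask -> 2, b -> 6
	                b = 6:  1 < 6, skip;             mask -> 1, b -> 3
	                b = 3:  1 < 3, skip;             mask -> 0, stop
	   giving quotient c = 4 and remainder a = 1. */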

	/* Shift up */
	/* XXX: since it just has to be "at least as big", we can
	   probably eliminate this horribly wasteful loop.  I will
	   have to prove this first, though */
	set128(0, 0, 0, 1, mask);
	while (cmp128(b, a) < 0 && !btsthi128(b)) {
		lslone128(b);
		lslone128(mask);
	}

	/* Shift down */
	zero128(c);
	do {
		if (cmp128(a, b) >= 0) {
			sub128(b, a);
			add128(mask, c);
		}
		lsrone128(mask);
		lsrone128(b);
	} while (nonzerop128(mask));

	/* The remainder is in a... */
}
#endif

#endif	/* MULTI_ARITH_H */