/* $NetBSD: fpu_mul.c,v 1.4 2005/12/11 12:18:42 christos Exp $ */12/*3* SPDX-License-Identifier: BSD-3-Clause4*5* Copyright (c) 1992, 19936* The Regents of the University of California. All rights reserved.7*8* This software was developed by the Computer Systems Engineering group9* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and10* contributed to Berkeley.11*12* All advertising materials mentioning features or use of this software13* must display the following acknowledgement:14* This product includes software developed by the University of15* California, Lawrence Berkeley Laboratory.16*17* Redistribution and use in source and binary forms, with or without18* modification, are permitted provided that the following conditions19* are met:20* 1. Redistributions of source code must retain the above copyright21* notice, this list of conditions and the following disclaimer.22* 2. Redistributions in binary form must reproduce the above copyright23* notice, this list of conditions and the following disclaimer in the24* documentation and/or other materials provided with the distribution.25* 3. Neither the name of the University nor the names of its contributors26* may be used to endorse or promote products derived from this software27* without specific prior written permission.28*29* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND30* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE31* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE32* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE33* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL34* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS35* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)36* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT37* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY38* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF39* SUCH DAMAGE.40*/4142/*43* Perform an FPU multiply (return x * y).44*/4546#include <sys/types.h>47#include <sys/systm.h>4849#include <machine/fpu.h>5051#include <powerpc/fpu/fpu_arith.h>52#include <powerpc/fpu/fpu_emu.h>5354/*55* The multiplication algorithm for normal numbers is as follows:56*57* The fraction of the product is built in the usual stepwise fashion.58* Each step consists of shifting the accumulator right one bit59* (maintaining any guard bits) and, if the next bit in y is set,60* adding the multiplicand (x) to the accumulator. Then, in any case,61* we advance one bit leftward in y. Algorithmically:62*63* A = 0;64* for (bit = 0; bit < FP_NMANT; bit++) {65* sticky |= A & 1, A >>= 1;66* if (Y & (1 << bit))67* A += X;68* }69*70* (X and Y here represent the mantissas of x and y respectively.)71* The resultant accumulator (A) is the product's mantissa. It may72* be as large as 11.11111... in binary and hence may need to be73* shifted right, but at most one bit.74*75* Since we do not have efficient multiword arithmetic, we code the76* accumulator as four separate words, just like any other mantissa.77* We use local variables in the hope that this is faster than memory.78* We keep x->fp_mant in locals for the same reason.79*80* In the algorithm above, the bits in y are inspected one at a time.81* We will pick them up 32 at a time and then deal with those 32, one82* at a time. Note, however, that we know several things about y:83*84* - the guard and round bits at the bottom are sure to be zero;85*86* - often many low bits are zero (y is often from a single or double87* precision source);88*89* - bit FP_NMANT-1 is set, and FP_1*2 fits in a word.90*91* We can also test for 32-zero-bits swiftly. In this case, the center92* part of the loop---setting sticky, shifting A, and not adding---will93* run 32 times without adding X to A. We can do a 32-bit shift faster94* by simply moving words. Since zeros are common, we optimize this case.95* Furthermore, since A is initially zero, we can omit the shift as well96* until we reach a nonzero word.97*/98struct fpn *99fpu_mul(struct fpemu *fe)100{101struct fpn *x = &fe->fe_f1, *y = &fe->fe_f2;102u_int a3, a2, a1, a0, x3, x2, x1, x0, bit, m;103int sticky;104FPU_DECL_CARRY;105106/*107* Put the `heavier' operand on the right (see fpu_emu.h).108* Then we will have one of the following cases, taken in the109* following order:110*111* - y = NaN. Implied: if only one is a signalling NaN, y is.112* The result is y.113* - y = Inf. Implied: x != NaN (is 0, number, or Inf: the NaN114* case was taken care of earlier).115* If x = 0, the result is NaN. Otherwise the result116* is y, with its sign reversed if x is negative.117* - x = 0. Implied: y is 0 or number.118* The result is 0 (with XORed sign as usual).119* - other. Implied: both x and y are numbers.120* The result is x * y (XOR sign, multiply bits, add exponents).121*/122DPRINTF(FPE_REG, ("fpu_mul:\n"));123DUMPFPN(FPE_REG, x);124DUMPFPN(FPE_REG, y);125DPRINTF(FPE_REG, ("=>\n"));126127ORDER(x, y);128if (ISNAN(y)) {129y->fp_sign ^= x->fp_sign;130fe->fe_cx |= FPSCR_VXSNAN;131DUMPFPN(FPE_REG, y);132return (y);133}134if (ISINF(y)) {135if (ISZERO(x)) {136fe->fe_cx |= FPSCR_VXIMZ;137return (fpu_newnan(fe));138}139y->fp_sign ^= x->fp_sign;140DUMPFPN(FPE_REG, y);141return (y);142}143if (ISZERO(x)) {144x->fp_sign ^= y->fp_sign;145DUMPFPN(FPE_REG, x);146return (x);147}148149/*150* Setup. In the code below, the mask `m' will hold the current151* mantissa byte from y. The variable `bit' denotes the bit152* within m. We also define some macros to deal with everything.153*/154x3 = x->fp_mant[3];155x2 = x->fp_mant[2];156x1 = x->fp_mant[1];157x0 = x->fp_mant[0];158sticky = a3 = a2 = a1 = a0 = 0;159160#define ADD /* A += X */ \161FPU_ADDS(a3, a3, x3); \162FPU_ADDCS(a2, a2, x2); \163FPU_ADDCS(a1, a1, x1); \164FPU_ADDC(a0, a0, x0)165166#define SHR1 /* A >>= 1, with sticky */ \167sticky |= a3 & 1, a3 = (a3 >> 1) | (a2 << 31), \168a2 = (a2 >> 1) | (a1 << 31), a1 = (a1 >> 1) | (a0 << 31), a0 >>= 1169170#define SHR32 /* A >>= 32, with sticky */ \171sticky |= a3, a3 = a2, a2 = a1, a1 = a0, a0 = 0172173#define STEP /* each 1-bit step of the multiplication */ \174SHR1; if (bit & m) { ADD; }; bit <<= 1175176/*177* We are ready to begin. The multiply loop runs once for each178* of the four 32-bit words. Some words, however, are special.179* As noted above, the low order bits of Y are often zero. Even180* if not, the first loop can certainly skip the guard bits.181* The last word of y has its highest 1-bit in position FP_NMANT-1,182* so we stop the loop when we move past that bit.183*/184if ((m = y->fp_mant[3]) == 0) {185/* SHR32; */ /* unneeded since A==0 */186} else {187bit = 1 << FP_NG;188do {189STEP;190} while (bit != 0);191}192if ((m = y->fp_mant[2]) == 0) {193SHR32;194} else {195bit = 1;196do {197STEP;198} while (bit != 0);199}200if ((m = y->fp_mant[1]) == 0) {201SHR32;202} else {203bit = 1;204do {205STEP;206} while (bit != 0);207}208m = y->fp_mant[0]; /* definitely != 0 */209bit = 1;210do {211STEP;212} while (bit <= m);213214/*215* Done with mantissa calculation. Get exponent and handle216* 11.111...1 case, then put result in place. We reuse x since217* it already has the right class (FP_NUM).218*/219m = x->fp_exp + y->fp_exp;220if (a0 >= FP_2) {221SHR1;222m++;223}224x->fp_sign ^= y->fp_sign;225x->fp_exp = m;226x->fp_sticky = sticky;227x->fp_mant[3] = a3;228x->fp_mant[2] = a2;229x->fp_mant[1] = a1;230x->fp_mant[0] = a0;231232DUMPFPN(FPE_REG, x);233return (x);234}235236237