GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/m68k/math-emu/multi_arith.h
/* multi_arith.h: multi-precision integer arithmetic functions, needed
   to do extended-precision floating point.

   (c) 1998 David Huggins-Daines.

   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
   David Mosberger-Tang.

   You may copy, modify, and redistribute this file under the terms of
   the GNU General Public License, version 2, or any later version, at
   your convenience. */

/* Note:

   These are not general multi-precision math routines.  Rather, they
   implement the subset of integer arithmetic that we need in order to
   multiply, divide, and normalize 128-bit unsigned mantissae. */

#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H

#if 0 /* old code... */

/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];

/* Word order */
enum {
	MSW128,
	NMSW128,
	NLSW128,
	LSW128
};

/* big-endian */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])

/* Convenience functions to stuff various integer values into int128s */

static inline void zero128(int128 a)
{
	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}

/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
			  unsigned int i0, int128 a)
{
	a[LSW128] = i0;
	a[NLSW128] = i1;
	a[NMSW128] = i2;
	a[MSW128] = i3;
}

/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
	dest[LSW128] = (unsigned int) src;
	dest[NLSW128] = src >> 32;
	dest[NMSW128] = dest[MSW128] = 0;
}

static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
	*dest = src[LSW128] | (long long) src[NLSW128] << 32;
}

static inline void put_i128(const int128 a)
{
	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
	       a[NLSW128], a[LSW128]);
}

/* Internal shifters:

   Note that these are only good for 0 < count < 32.
 */

static inline void _lsl128(unsigned int count, int128 a)
{
	a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
	a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
	a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
	a[LSW128] <<= count;
}

static inline void _lsr128(unsigned int count, int128 a)
{
	a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
	a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
	a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
	a[MSW128] >>= count;
}

/* Should be faster, one would hope */

static inline void lslone128(int128 a)
{
	asm volatile ("lsl.l #1,%0\n"
		      "roxl.l #1,%1\n"
		      "roxl.l #1,%2\n"
		      "roxl.l #1,%3\n"
		      :
		      "=d" (a[LSW128]),
		      "=d" (a[NLSW128]),
		      "=d" (a[NMSW128]),
		      "=d" (a[MSW128])
		      :
		      "0" (a[LSW128]),
		      "1" (a[NLSW128]),
		      "2" (a[NMSW128]),
		      "3" (a[MSW128]));
}

static inline void lsrone128(int128 a)
{
	asm volatile ("lsr.l #1,%0\n"
		      "roxr.l #1,%1\n"
		      "roxr.l #1,%2\n"
		      "roxr.l #1,%3\n"
		      :
		      "=d" (a[MSW128]),
		      "=d" (a[NMSW128]),
		      "=d" (a[NLSW128]),
		      "=d" (a[LSW128])
		      :
		      "0" (a[MSW128]),
		      "1" (a[NMSW128]),
		      "2" (a[NLSW128]),
		      "3" (a[LSW128]));
}

/* Generalized 128-bit shifters:

   These bit-shift to a multiple of 32, then move whole longwords. */

static inline void lsl128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsl128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	/* argh, gak, endian-sensitive */
	for (i = 0; i < 4 - wordcount; i++) {
		a[i] = a[i + wordcount];
	}
	for (i = 3; i >= 4 - wordcount; --i) {
		a[i] = 0;
	}
}

static inline void lsr128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsr128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	for (i = 3; i >= wordcount; --i) {
		a[i] = a[i - wordcount];
	}
	for (i = 0; i < wordcount; i++) {
		a[i] = 0;
	}
}

static inline void orl128(int a, int128 b)
{
	b[LSW128] |= a;
}

static inline int btsthi128(const int128 a)
{
	return a[MSW128] & 0x80000000;
}

/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
	int r = 0;

	if (top > 31)
		r |= a[LSW128];
	if (top > 63)
		r |= a[NLSW128];
	if (top > 95)
		r |= a[NMSW128];

	r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);

	return (r != 0);
}

/* Aargh.  We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
	*mask = 0;

	if (pos < 32) {
		LO_WORD(*mask) = (1 << pos) - 1;
		return;
	}
	LO_WORD(*mask) = -1;
	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}

static inline void bset64(int pos, unsigned long long *dest)
{
	/* This conditional will be optimized away.  Thanks, GCC! */
	if (pos < 32)
		asm volatile ("bset %1,%0" : "=m" (LO_WORD(*dest))
			      : "id" (pos));
	else
		asm volatile ("bset %1,%0" : "=m" (HI_WORD(*dest))
			      : "id" (pos - 32));
}

static inline int btst64(int pos, unsigned long long dest)
{
	if (pos < 32)
		return (0 != (LO_WORD(dest) & (1 << pos)));
	else
		return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}

static inline void lsl64(int count, unsigned long long *dest)
{
	if (count < 32) {
		HI_WORD(*dest) = (HI_WORD(*dest) << count)
		    | (LO_WORD(*dest) >> (32 - count));
		LO_WORD(*dest) <<= count;
		return;
	}
	count -= 32;
	HI_WORD(*dest) = LO_WORD(*dest) << count;
	LO_WORD(*dest) = 0;
}

static inline void lsr64(int count, unsigned long long *dest)
{
	if (count < 32) {
		LO_WORD(*dest) = (LO_WORD(*dest) >> count)
		    | (HI_WORD(*dest) << (32 - count));
		HI_WORD(*dest) >>= count;
		return;
	}
	count -= 32;
	LO_WORD(*dest) = HI_WORD(*dest) >> count;
	HI_WORD(*dest) = 0;
}
#endif

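/* fp_denormalize() shifts the 64-bit mantissa right by cnt bits while
   raising the exponent by the same amount.  The eight bits shifted out
   first are kept in reg->lowmant as a guard byte; any bits lost below
   that are folded into its least significant bit as a "sticky" bit, so
   later rounding can still tell that the discarded part was nonzero. */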
static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
	reg->exp += cnt;

	switch (cnt) {
	case 0 ... 8:
		reg->lowmant = reg->mant.m32[1] << (8 - cnt);
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 9 ... 32:
		reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 33 ... 39:
		asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
			      : "m" (reg->mant.m32[0]), "d" (64 - cnt));
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	case 40 ... 71:
		reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
		if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	default:
		reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
		reg->mant.m32[0] = 0;
		reg->mant.m32[1] = 0;
		break;
	}
}

static inline int fp_overnormalize(struct fp_ext *reg)
{
	int shift;

	if (reg->mant.m32[0]) {
		asm ("bfffo %1{#0,#32},%0" : "=d" (shift)
		     : "dm" (reg->mant.m32[0]));
		reg->mant.m32[0] = (reg->mant.m32[0] << shift) |
				   (reg->mant.m32[1] >> (32 - shift));
		reg->mant.m32[1] = (reg->mant.m32[1] << shift);
	} else {
		asm ("bfffo %1{#0,#32},%0" : "=d" (shift)
		     : "dm" (reg->mant.m32[1]));
		reg->mant.m32[0] = (reg->mant.m32[1] << shift);
		reg->mant.m32[1] = 0;
		shift += 32;
	}

	return shift;
}

static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
{
	int carry;

	/* we assume here that gcc only inserts a move and a clr instruction */
	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
		      : "g,d" (src->lowmant), "0,0" (dest->lowmant));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
		      : "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
		      : "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));

	return carry;
}

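/* fp_addcarry() is called when a mantissa addition carried out of bit 63:
   it renormalizes by bumping the exponent and shifting the mantissa right
   one bit, reinstating the leading one.  If the exponent reaches 0x7fff
   the result has overflowed; the overflow (and, if bits were lost, the
   inexact) exception is set instead and 0 is returned. */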
static inline int fp_addcarry(struct fp_ext *reg)
{
	if (++reg->exp == 0x7fff) {
		if (reg->mant.m64)
			fp_set_sr(FPSR_EXC_INEX2);
		reg->mant.m64 = 0;
		fp_set_sr(FPSR_EXC_OVFL);
		return 0;
	}
	reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
	reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
			   (reg->mant.m32[0] << 31);
	reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;

	return 1;
}

static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
			      struct fp_ext *src2)
{
	/* we assume here that gcc only inserts a move and a clr instruction */
	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
		      : "g,d" (src2->lowmant), "0,0" (src1->lowmant));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
		      : "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
		      : "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
}

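/* The helpers below wrap the CPU's 32x32->64 mulu.l and 64/32 divu.l
   instructions: fp_mul64() leaves the full 64-bit product of src1 and src2
   in desth:destl; fp_div64() divides the 64-bit value srch:srcl by div,
   giving a 32-bit quotient and remainder; fp_add64()/fp_sub64() are 64-bit
   add/subtract with carry propagation; fp_addx96() adds a 64-bit value
   into the upper 96 bits of a 128-bit mantissa; fp_sub96c() is a 96-bit
   subtract that returns the final borrow. */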
#define fp_mul64(desth, destl, src1, src2) ({ \
	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth) \
		: "dm" (src1), "0" (src2)); \
})
#define fp_div64(quot, rem, srch, srcl, div) \
	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem) \
		: "dm" (div), "1" (srch), "0" (srcl))
#define fp_add64(dest1, dest2, src1, src2) ({ \
	asm ("add.l %1,%0" : "=d,dm" (dest2) \
		: "dm,d" (src2), "0,0" (dest2)); \
	asm ("addx.l %1,%0" : "=d" (dest1) \
		: "d" (src1), "0" (dest1)); \
})
#define fp_addx96(dest, src) ({ \
	/* we assume here that gcc only inserts a move and a clr instr */ \
	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2]) \
		: "g,d" (src.m32[1]), "0,0" (dest->m32[2])); \
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1]) \
		: "d" (src.m32[0]), "0" (dest->m32[1])); \
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0]) \
		: "d" (0), "0" (dest->m32[0])); \
})
#define fp_sub64(dest, src) ({ \
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1]) \
		: "dm,d" (src.m32[1]), "0,0" (dest.m32[1])); \
	asm ("subx.l %1,%0" : "=d" (dest.m32[0]) \
		: "d" (src.m32[0]), "0" (dest.m32[0])); \
})
#define fp_sub96c(dest, srch, srcm, srcl) ({ \
	char carry; \
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2]) \
		: "dm,d" (srcl), "0,0" (dest.m32[2])); \
	asm ("subx.l %1,%0" : "=d" (dest.m32[1]) \
		: "d" (srcm), "0" (dest.m32[1])); \
	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0]) \
		: "d" (srch), "1" (dest.m32[0])); \
	carry; \
})

static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
				   struct fp_ext *src2)
{
	union fp_mant64 temp;

	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
	fp_addx96(dest, temp);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
	fp_addx96(dest, temp);
}
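
/* The function above is the classic schoolbook scheme: writing A:B and C:D
   for the 32-bit halves of the two 64-bit mantissae,
   (A*2^32 + B) * (C*2^32 + D) = A*C*2^64 + (A*D + B*C)*2^32 + B*D,
   so A*C fills m32[0..1], B*D fills m32[2..3], and the two cross products
   are added in shifted by 32 bits via fp_addx96().  The sketch below is an
   editorial illustration only -- portable C, not used anywhere in the
   emulator, with a made-up name -- and is kept inside "#if 0" like the
   other dead code in this file. */
#if 0
static inline void mul64x64_sketch(unsigned int a_hi, unsigned int a_lo,
				   unsigned int b_hi, unsigned int b_lo,
				   unsigned int res[4])
{
	unsigned long long hh = (unsigned long long) a_hi * b_hi;
	unsigned long long ll = (unsigned long long) a_lo * b_lo;
	unsigned long long hl = (unsigned long long) a_hi * b_lo;
	unsigned long long lh = (unsigned long long) a_lo * b_hi;
	/* middle 32-bit column: high half of the low product plus the low
	   halves of the two cross products */
	unsigned long long mid = (ll >> 32) + (unsigned int) hl +
				 (unsigned int) lh;

	res[3] = (unsigned int) ll;		/* bits   0..31  */
	res[2] = (unsigned int) mid;		/* bits  32..63  */
	hh += (mid >> 32) + (hl >> 32) + (lh >> 32);
	res[1] = (unsigned int) hh;		/* bits  64..95  */
	res[0] = (unsigned int) (hh >> 32);	/* bits 96..127, most
						   significant word first,
						   as in union fp_mant128 */
}
#endif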

static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
				 struct fp_ext *div)
{
	union fp_mant128 tmp;
	union fp_mant64 tmp64;
	unsigned long *mantp = dest->m32;
	unsigned long fix, rem, first, dummy;
	int i;

	/* the algorithm below requires the dividend (src) to be smaller
	   than div, but both have the high bit set */
	if (src->mant.m64 >= div->mant.m64) {
		fp_sub64(src->mant, div->mant);
		*mantp = 1;
	} else
		*mantp = 0;
	mantp++;

	/* basic idea behind this algorithm: we can't divide two 64-bit
	   numbers (AB/CD) directly, but we can calculate AB/C0.  That
	   quotient is off by a factor of C0/CD, so we multiply the first
	   result by this factor to fix it up; after that the result is
	   nearly correct and only a few corrections are needed.  (A short
	   worked note follows this function.) */

	/* C0/CD could be precalculated, but that is again a 64-bit
	   division.  We can make it easier by first dividing through by C,
	   which gives 10/1D; now only a single shift is needed and the
	   value fits into 32 bits. */
	fix = 0x80000000;
	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
	dummy = (dummy >> 1) | fix;
	fp_div64(fix, dummy, fix, 0, dummy);
	fix--;

	for (i = 0; i < 3; i++, mantp++) {
		if (src->mant.m32[0] == div->mant.m32[0]) {
			fp_div64(first, rem, 0, src->mant.m32[1],
				 div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
			*mantp += fix;
		} else {
			fp_div64(first, rem, src->mant.m32[0],
				 src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
		}

		fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0],
			 first - *mantp);
		fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
		tmp.m32[2] = 0;

		fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp,
			 div->mant.m32[1]);
		fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);

		src->mant.m32[0] = tmp.m32[1];
		src->mant.m32[1] = tmp.m32[2];

		while (!fp_sub96c(tmp, 0, div->mant.m32[0],
				  div->mant.m32[1])) {
			src->mant.m32[0] = tmp.m32[1];
			src->mant.m32[1] = tmp.m32[2];
			*mantp += 1;
		}
	}
}
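
/* A short worked note on the scheme above (an editorial addition, not part
   of the original comments): this is ordinary long division done with
   32-bit "digits".  Each pass estimates the next quotient word by dividing
   the leading 64 bits of the running remainder by only the top word C of
   the divisor -- an over-estimate, since the divisor's low word D is
   ignored -- and then scales that estimate by `fix', a precomputed 32-bit
   approximation of C0/CD, which brings it to within a few units of the
   true digit; the trailing while loop applies those last unit
   corrections. */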

#if 0
static inline unsigned int fp_fls128(union fp_mant128 *src)
{
	unsigned long data;
	unsigned int res, off;

	if ((data = src->m32[0]))
		off = 0;
	else if ((data = src->m32[1]))
		off = 32;
	else if ((data = src->m32[2]))
		off = 64;
	else if ((data = src->m32[3]))
		off = 96;
	else
		return 128;

	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
	return res + off;
}

static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
	unsigned long sticky;

	switch (shift) {
	case 0:
		return;
	case 1:
		asm volatile ("lsl.l #1,%0"
			      : "=d" (src->m32[3]) : "0" (src->m32[3]));
		asm volatile ("roxl.l #1,%0"
			      : "=d" (src->m32[2]) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			      : "=d" (src->m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			      : "=d" (src->m32[0]) : "0" (src->m32[0]));
		return;
	case 2 ... 31:
		src->m32[0] = (src->m32[0] << shift) |
			      (src->m32[1] >> (32 - shift));
		src->m32[1] = (src->m32[1] << shift) |
			      (src->m32[2] >> (32 - shift));
		src->m32[2] = (src->m32[2] << shift) |
			      (src->m32[3] >> (32 - shift));
		src->m32[3] = (src->m32[3] << shift);
		return;
	case 32 ... 63:
		shift -= 32;
		src->m32[0] = (src->m32[1] << shift) |
			      (src->m32[2] >> (32 - shift));
		src->m32[1] = (src->m32[2] << shift) |
			      (src->m32[3] >> (32 - shift));
		src->m32[2] = (src->m32[3] << shift);
		src->m32[3] = 0;
		return;
	case 64 ... 95:
		shift -= 64;
		src->m32[0] = (src->m32[2] << shift) |
			      (src->m32[3] >> (32 - shift));
		src->m32[1] = (src->m32[3] << shift);
		src->m32[2] = src->m32[3] = 0;
		return;
	case 96 ... 127:
		shift -= 96;
		src->m32[0] = (src->m32[3] << shift);
		src->m32[1] = src->m32[2] = src->m32[3] = 0;
		return;
	case -31 ... -1:
		shift = -shift;
		sticky = 0;
		if (src->m32[3] << (32 - shift))
			sticky = 1;
		src->m32[3] = (src->m32[3] >> shift) |
			      (src->m32[2] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[2] >> shift) |
			      (src->m32[1] << (32 - shift));
		src->m32[1] = (src->m32[1] >> shift) |
			      (src->m32[0] << (32 - shift));
		src->m32[0] = (src->m32[0] >> shift);
		return;
	case -63 ... -32:
		shift = -shift - 32;
		sticky = 0;
		if ((src->m32[2] << (32 - shift)) || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[2] >> shift) |
			      (src->m32[1] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[1] >> shift) |
			      (src->m32[0] << (32 - shift));
		src->m32[1] = (src->m32[0] >> shift);
		src->m32[0] = 0;
		return;
	case -95 ... -64:
		shift = -shift - 64;
		sticky = 0;
		if ((src->m32[1] << (32 - shift)) || src->m32[2] ||
		    src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[1] >> shift) |
			      (src->m32[0] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[0] >> shift);
		src->m32[1] = src->m32[0] = 0;
		return;
	case -127 ... -96:
		shift = -shift - 96;
		sticky = 0;
		if ((src->m32[0] << (32 - shift)) || src->m32[1] ||
		    src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[0] >> shift) | sticky;
		src->m32[2] = src->m32[1] = src->m32[0] = 0;
		return;
	}

	if (shift < 0 && (src->m32[0] || src->m32[1] ||
			  src->m32[2] || src->m32[3]))
		src->m32[3] = 1;
	else
		src->m32[3] = 0;
	src->m32[2] = 0;
	src->m32[1] = 0;
	src->m32[0] = 0;
}
#endif

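/* fp_putmant128() stores a 128-bit product back into an extended-precision
   register: the value in src is shifted left by `shift' (callers only need
   0, 1, 31 or 32), the top 64 bits go into dest->mant, the next eight bits
   into the lowmant guard byte, and any remaining nonzero low-order bits
   are folded into its LSB as a sticky bit. */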
static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
				 int shift)
{
	unsigned long tmp;

	switch (shift) {
	case 0:
		dest->mant.m64 = src->m64[0];
		dest->lowmant = src->m32[2] >> 24;
		if (src->m32[3] || (src->m32[2] << 8))
			dest->lowmant |= 1;
		break;
	case 1:
		asm volatile ("lsl.l #1,%0"
			      : "=d" (tmp) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			      : "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			      : "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] || (tmp << 8))
			dest->lowmant |= 1;
		break;
	case 31:
		asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
			      : "=d" (dest->mant.m32[0])
			      : "d" (src->m32[0]), "0" (src->m32[1]));
		asm volatile ("roxr.l #1,%0"
			      : "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
		asm volatile ("roxr.l #1,%0"
			      : "=d" (tmp) : "0" (src->m32[3]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] << 7)
			dest->lowmant |= 1;
		break;
	case 32:
		dest->mant.m32[0] = src->m32[1];
		dest->mant.m32[1] = src->m32[2];
		dest->lowmant = src->m32[3] >> 24;
		if (src->m32[3] << 8)
			dest->lowmant |= 1;
		break;
	}
}

#if 0 /* old code... */
static inline int fls(unsigned int a)
{
	int r;

	asm volatile ("bfffo %1{#0,#32},%0"
		      : "=d" (r) : "md" (a));
	return r;
}

/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
	if (a[MSW128])
		return fls(a[MSW128]);
	if (a[NMSW128])
		return fls(a[NMSW128]) + 32;
	/* XXX: it probably never gets beyond this point in actual
	   use, but that's indicative of a more general problem in the
	   algorithm (i.e. as per the actual 68881 implementation, we
	   really only need at most 67 bits of precision [plus
	   overflow]) so I'm not going to fix it. */
	if (a[NLSW128])
		return fls(a[NLSW128]) + 64;
	if (a[LSW128])
		return fls(a[LSW128]) + 96;
	else
		return -1;
}

static inline int zerop128(const int128 a)
{
	return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

static inline int nonzerop128(const int128 a)
{
	return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
   here */
static inline void add128(const int128 a, int128 b)
{
	/* rotating carry flags */
	unsigned int carry[2];

	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
	b[LSW128] += a[LSW128];

	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];

	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];

	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}

/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
	/* rotating borrow flags */
	unsigned int borrow[2];

	borrow[0] = b[LSW128] < a[LSW128];
	b[LSW128] -= a[LSW128];

	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];

	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];

	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}

/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
	unsigned long long acc;
	int128 acc128;

	zero128(acc128);
	zero128(c);

	/* first the low words */
	if (LO_WORD(a) && LO_WORD(b)) {
		acc = (long long) LO_WORD(a) * LO_WORD(b);
		c[NLSW128] = HI_WORD(acc);
		c[LSW128] = LO_WORD(acc);
	}
	/* Next the high words */
	if (HI_WORD(a) && HI_WORD(b)) {
		acc = (long long) HI_WORD(a) * HI_WORD(b);
		c[MSW128] = HI_WORD(acc);
		c[NMSW128] = LO_WORD(acc);
	}
	/* The middle words */
	if (LO_WORD(a) && HI_WORD(b)) {
		acc = (long long) LO_WORD(a) * HI_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
	/* The first and last words */
	if (HI_WORD(a) && LO_WORD(b)) {
		acc = (long long) HI_WORD(a) * LO_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
}

/* Note: unsigned */
static inline int cmp128(int128 a, int128 b)
{
	if (a[MSW128] < b[MSW128])
		return -1;
	if (a[MSW128] > b[MSW128])
		return 1;
	if (a[NMSW128] < b[NMSW128])
		return -1;
	if (a[NMSW128] > b[NMSW128])
		return 1;
	if (a[NLSW128] < b[NLSW128])
		return -1;
	if (a[NLSW128] > b[NLSW128])
		return 1;

	return (signed) a[LSW128] - b[LSW128];
}

inline void div128(int128 a, int128 b, int128 c)
{
	int128 mask;

	/* Algorithm:

	   Shift the divisor until it's at least as big as the
	   dividend, keeping track of the position to which we've
	   shifted it, i.e. the power of 2 which we've multiplied it
	   by.

	   Then, for this power of 2 (the mask), and every one smaller
	   than it, subtract the mask from the dividend and add it to
	   the quotient until the dividend is smaller than the raised
	   divisor.  At this point, divide the dividend and the mask
	   by 2 (i.e. shift one place to the right).  Lather, rinse,
	   and repeat, until there are no more powers of 2 left. */

	/* FIXME: needless to say, there's room for improvement here too. */

	/* Shift up */
	/* XXX: since it just has to be "at least as big", we can
	   probably eliminate this horribly wasteful loop.  I will
	   have to prove this first, though */
	set128(0, 0, 0, 1, mask);
	while (cmp128(b, a) < 0 && !btsthi128(b)) {
		lslone128(b);
		lslone128(mask);
	}

	/* Shift down */
	zero128(c);
	do {
		if (cmp128(a, b) >= 0) {
			sub128(b, a);
			add128(mask, c);
		}
		lsrone128(mask);
		lsrone128(b);
	} while (nonzerop128(mask));

	/* The remainder is in a... */
}
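
/* Illustration only (not part of the original file): the same
   shift-and-subtract scheme as div128() above, on plain 32-bit operands.
   The helper is made up and nothing calls it; like div128() it assumes a
   nonzero divisor and leaves the remainder behind in the dividend. */
static inline unsigned int div32_sketch(unsigned int *a, unsigned int b)
{
	unsigned int mask = 1, quot = 0;

	/* shift the divisor up until it is at least as big as the
	   dividend, or until its top bit is set */
	while (b < *a && !(b & 0x80000000)) {
		b <<= 1;
		mask <<= 1;
	}
	/* subtract it back out, one power of two at a time */
	do {
		if (*a >= b) {
			*a -= b;
			quot |= mask;
		}
		mask >>= 1;
		b >>= 1;
	} while (mask);

	return quot;		/* the remainder is left in *a */
}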
#endif

#endif /* MULTI_ARITH_H */