Path: arch/m68k/math-emu/multi_arith.h
/* multi_arith.h: multi-precision integer arithmetic functions, needed
   to do extended-precision floating point.

   (c) 1998 David Huggins-Daines.

   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
   David Mosberger-Tang.

   You may copy, modify, and redistribute this file under the terms of
   the GNU General Public License, version 2, or any later version, at
   your convenience. */

/* Note:

   These are not general multi-precision math routines.  Rather, they
   implement the subset of integer arithmetic that we need in order to
   multiply, divide, and normalize 128-bit unsigned mantissae. */

#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H

#if 0	/* old code... */

/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];

/* Word order */
enum {
	MSW128,
	NMSW128,
	NLSW128,
	LSW128
};

/* big-endian */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])

/* Convenience functions to stuff various integer values into int128s */

static inline void zero128(int128 a)
{
	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}

/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
			  unsigned int i0, int128 a)
{
	a[LSW128] = i0;
	a[NLSW128] = i1;
	a[NMSW128] = i2;
	a[MSW128] = i3;
}

/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
	dest[LSW128] = (unsigned int) src;
	dest[NLSW128] = src >> 32;
	dest[NMSW128] = dest[MSW128] = 0;
}

static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
	*dest = src[LSW128] | (long long) src[NLSW128] << 32;
}

static inline void put_i128(const int128 a)
{
	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
	       a[NLSW128], a[LSW128]);
}

/* Internal shifters:

   Note that these are only good for 0 < count < 32.  */

static inline void _lsl128(unsigned int count, int128 a)
{
	a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
	a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
	a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
	a[LSW128] <<= count;
}

static inline void _lsr128(unsigned int count, int128 a)
{
	a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
	a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
	a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
	a[MSW128] >>= count;
}

/* Should be faster, one would hope */

static inline void lslone128(int128 a)
{
	asm volatile ("lsl.l #1,%0\n"
		      "roxl.l #1,%1\n"
		      "roxl.l #1,%2\n"
		      "roxl.l #1,%3\n"
		      :
		      "=d" (a[LSW128]),
		      "=d" (a[NLSW128]),
		      "=d" (a[NMSW128]),
		      "=d" (a[MSW128])
		      :
		      "0" (a[LSW128]),
		      "1" (a[NLSW128]),
		      "2" (a[NMSW128]),
		      "3" (a[MSW128]));
}

static inline void lsrone128(int128 a)
{
	asm volatile ("lsr.l #1,%0\n"
		      "roxr.l #1,%1\n"
		      "roxr.l #1,%2\n"
		      "roxr.l #1,%3\n"
		      :
		      "=d" (a[MSW128]),
		      "=d" (a[NMSW128]),
		      "=d" (a[NLSW128]),
		      "=d" (a[LSW128])
		      :
		      "0" (a[MSW128]),
		      "1" (a[NMSW128]),
		      "2" (a[NLSW128]),
		      "3" (a[LSW128]));
}

/* Generalized 128-bit shifters:

   These bit-shift to a multiple of 32, then move whole longwords.  */
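
/* For example (illustrative): lsl128(40, a) first does _lsl128(8, a)
   for the 40 % 32 = 8 odd bits, then the longword loop below moves
   every word up by 40 / 32 = 1 place, for a total shift of 40 bits. */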

static inline void lsl128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsl128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	/* argh, gak, endian-sensitive */
	for (i = 0; i < 4 - wordcount; i++) {
		a[i] = a[i + wordcount];
	}
	for (i = 3; i >= 4 - wordcount; --i) {
		a[i] = 0;
	}
}

static inline void lsr128(unsigned int count, int128 a)
{
	int wordcount, i;

	if (count % 32)
		_lsr128(count % 32, a);

	if (0 == (wordcount = count / 32))
		return;

	for (i = 3; i >= wordcount; --i) {
		a[i] = a[i - wordcount];
	}
	for (i = 0; i < wordcount; i++) {
		a[i] = 0;
	}
}

static inline int orl128(int a, int128 b)
{
	b[LSW128] |= a;
}

static inline int btsthi128(const int128 a)
{
	return a[MSW128] & 0x80000000;
}

/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
	int r = 0;

	if (top > 31)
		r |= a[LSW128];
	if (top > 63)
		r |= a[NLSW128];
	if (top > 95)
		r |= a[NMSW128];

	r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);

	return (r != 0);
}

/* Aargh.  We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
	*mask = 0;

	if (pos < 32) {
		LO_WORD(*mask) = (1 << pos) - 1;
		return;
	}
	LO_WORD(*mask) = -1;
	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}

static inline void bset64(int pos, unsigned long long *dest)
{
	/* This conditional will be optimized away.  Thanks, GCC! */
	if (pos < 32)
		asm volatile ("bset %1,%0" : "=m" (LO_WORD(*dest))
			      : "id" (pos));
	else
		asm volatile ("bset %1,%0" : "=m" (HI_WORD(*dest))
			      : "id" (pos - 32));
}

static inline int btst64(int pos, unsigned long long dest)
{
	if (pos < 32)
		return (0 != (LO_WORD(dest) & (1 << pos)));
	else
		return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}

static inline void lsl64(int count, unsigned long long *dest)
{
	if (count < 32) {
		HI_WORD(*dest) = (HI_WORD(*dest) << count)
				 | (LO_WORD(*dest) >> count);
		LO_WORD(*dest) <<= count;
		return;
	}
	count -= 32;
	HI_WORD(*dest) = LO_WORD(*dest) << count;
	LO_WORD(*dest) = 0;
}

static inline void lsr64(int count, unsigned long long *dest)
{
	if (count < 32) {
		LO_WORD(*dest) = (LO_WORD(*dest) >> count)
				 | (HI_WORD(*dest) << (32 - count));
		HI_WORD(*dest) >>= count;
		return;
	}
	count -= 32;
	LO_WORD(*dest) = HI_WORD(*dest) >> count;
	HI_WORD(*dest) = 0;
}
#endif

static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
	reg->exp += cnt;

	switch (cnt) {
	case 0 ... 8:
		reg->lowmant = reg->mant.m32[1] << (8 - cnt);
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 9 ... 32:
		reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 33 ... 39:
		asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
			: "m" (reg->mant.m32[0]), "d" (64 - cnt));
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	case 40 ... 71:
		reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
		if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	default:
		reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
		reg->mant.m32[0] = 0;
		reg->mant.m32[1] = 0;
		break;
	}
}

static inline int fp_overnormalize(struct fp_ext *reg)
{
	int shift;

	if (reg->mant.m32[0]) {
		asm ("bfffo %1{#0,#32},%0"
			: "=d" (shift) : "dm" (reg->mant.m32[0]));
		reg->mant.m32[0] = (reg->mant.m32[0] << shift) |
				   (reg->mant.m32[1] >> (32 - shift));
		reg->mant.m32[1] = (reg->mant.m32[1] << shift);
	} else {
		asm ("bfffo %1{#0,#32},%0"
			: "=d" (shift) : "dm" (reg->mant.m32[1]));
		reg->mant.m32[0] = (reg->mant.m32[1] << shift);
		reg->mant.m32[1] = 0;
		shift += 32;
	}

	return shift;
}

static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
{
	int carry;

	/* we assume here, gcc only insert move and a clr instr */
	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src->lowmant), "0,0" (dest->lowmant));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));

	return carry;
}

static inline int fp_addcarry(struct fp_ext *reg)
{
	if (++reg->exp == 0x7fff) {
		if (reg->mant.m64)
			fp_set_sr(FPSR_EXC_INEX2);
		reg->mant.m64 = 0;
		fp_set_sr(FPSR_EXC_OVFL);
		return 0;
	}
	reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
	reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
			   (reg->mant.m32[0] << 31);
	reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;

	return 1;
}

static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
			      struct fp_ext *src2)
{
	/* we assume here, gcc only insert move and a clr instr */
	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src2->lowmant), "0,0" (src1->lowmant));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
}

#define fp_mul64(desth, destl, src1, src2) ({				\
	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)		\
		: "dm" (src1), "0" (src2));				\
})
#define fp_div64(quot, rem, srch, srcl, div)				\
	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)		\
		: "dm" (div), "1" (srch), "0" (srcl))
#define fp_add64(dest1, dest2, src1, src2) ({				\
	asm ("add.l %1,%0" : "=d,dm" (dest2)				\
		: "dm,d" (src2), "0,0" (dest2));			\
	asm ("addx.l %1,%0" : "=d" (dest1)				\
		: "d" (src1), "0" (dest1));				\
})
#define fp_addx96(dest, src) ({						\
	/* we assume here, gcc only insert move and a clr instr */	\
	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2])		\
		: "g,d" (temp.m32[1]), "0,0" (dest->m32[2]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1])		\
		: "d" (temp.m32[0]), "0" (dest->m32[1]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0])		\
		: "d" (0), "0" (dest->m32[0]));				\
})
#define fp_sub64(dest, src) ({						\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])			\
		: "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));		\
	asm ("subx.l %1,%0" : "=d" (dest.m32[0])			\
		: "d" (src.m32[0]), "0" (dest.m32[0]));			\
})
#define fp_sub96c(dest, srch, srcm, srcl) ({				\
	char carry;							\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])			\
		: "dm,d" (srcl), "0,0" (dest.m32[2]));			\
	asm ("subx.l %1,%0" : "=d" (dest.m32[1])			\
		: "d" (srcm), "0" (dest.m32[1]));			\
	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])	\
		: "d" (srch), "1" (dest.m32[0]));			\
	carry;								\
})
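
/* Illustrative only, not part of the emulator: a rough portable-C model
   of what the two asm primitives above compute, for readers without the
   m68k instruction set in their head.  fp_mul64() is a 32x32->64 multiply
   split into high/low longwords, fp_div64() a 64/32->32 divide with
   remainder (mulu.l/divu.l do each in one instruction).  The *_model
   names are local to this sketch and exist nowhere else. */
#if 0
static inline void fp_mul64_model(unsigned int *desth, unsigned int *destl,
				  unsigned int src1, unsigned int src2)
{
	unsigned long long prod = (unsigned long long) src1 * src2;

	*desth = (unsigned int) (prod >> 32);
	*destl = (unsigned int) prod;
}

static inline void fp_div64_model(unsigned int *quot, unsigned int *rem,
				  unsigned int srch, unsigned int srcl,
				  unsigned int div)
{
	unsigned long long num = ((unsigned long long) srch << 32) | srcl;

	/* like divu.l, this only makes sense if the quotient fits in 32 bits */
	*quot = (unsigned int) (num / div);
	*rem = (unsigned int) (num % div);
}
#endif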

static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
				   struct fp_ext *src2)
{
	union fp_mant64 temp;

	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
	fp_addx96(dest, temp);

	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
	fp_addx96(dest, temp);
}
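
/* The multiply above is schoolbook arithmetic on 32-bit digits: with
   src1 = A:B and src2 = C:D (high:low longwords), the 128-bit product is
   A*C << 64  +  (A*D + B*C) << 32  +  B*D, and fp_addx96() folds the two
   middle partial products into the upper 96 bits of dest.  A portable
   sketch of the same computation (illustrative only; a[0]/b[0] are the
   high longwords, dest[0] the most significant result longword): */
#if 0
static inline void fp_multiplymant_model(unsigned int dest[4],
					 const unsigned int a[2],
					 const unsigned int b[2])
{
	unsigned long long lo = (unsigned long long) a[1] * b[1];
	unsigned long long hi = (unsigned long long) a[0] * b[0];
	unsigned long long mid1 = (unsigned long long) a[0] * b[1];
	unsigned long long mid2 = (unsigned long long) a[1] * b[0];
	unsigned long long sum;

	dest[3] = (unsigned int) lo;
	sum = (lo >> 32) + (unsigned int) mid1 + (unsigned int) mid2;
	dest[2] = (unsigned int) sum;
	sum = (sum >> 32) + (unsigned int) hi + (mid1 >> 32) + (mid2 >> 32);
	dest[1] = (unsigned int) sum;
	dest[0] = (unsigned int) (hi >> 32) + (unsigned int) (sum >> 32);
}
#endif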

static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
				 struct fp_ext *div)
{
	union fp_mant128 tmp;
	union fp_mant64 tmp64;
	unsigned long *mantp = dest->m32;
	unsigned long fix, rem, first, dummy;
	int i;

	/* the algorithm below requires dest to be smaller than div,
	   but both have the high bit set */
	if (src->mant.m64 >= div->mant.m64) {
		fp_sub64(src->mant, div->mant);
		*mantp = 1;
	} else
		*mantp = 0;
	mantp++;

	/* basic idea behind this algorithm: we can't divide two 64bit numbers
	   (AB/CD) directly, but we can calculate AB/C0, but this means this
	   quotient is off by C0/CD, so we have to multiply the first result
	   to fix the result, after that we have nearly the correct result
	   and only a few corrections are needed. */

	/* C0/CD can be precalculated, but it's an 64bit division again, but
	   we can make it a bit easier, by dividing first through C so we get
	   10/1D and now only a single shift and the value fits into 32bit. */
	fix = 0x80000000;
	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
	dummy = (dummy >> 1) | fix;
	fp_div64(fix, dummy, fix, 0, dummy);
	fix--;

	for (i = 0; i < 3; i++, mantp++) {
		if (src->mant.m32[0] == div->mant.m32[0]) {
			fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
			*mantp += fix;
		} else {
			fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
		}

		fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
		fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
		tmp.m32[2] = 0;

		fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
		fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);

		src->mant.m32[0] = tmp.m32[1];
		src->mant.m32[1] = tmp.m32[2];

		while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
			src->mant.m32[0] = tmp.m32[1];
			src->mant.m32[1] = tmp.m32[2];
			*mantp += 1;
		}
	}
}
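
/* Illustrative cross-check only, not the algorithm above: with 64-bit
   arithmetic freely available, the same quotient digits (one integer bit
   plus three 32-bit fraction digits of src/div) could be produced by
   plain restoring long division.  The fp_dividemant_model name and its
   bitwise inner loop are local to this sketch. */
#if 0
static inline void fp_dividemant_model(unsigned int quot[4],
				       unsigned long long src,
				       unsigned long long div)
{
	unsigned long long rem = src, digit;
	int i, bit;

	/* both mantissae are normalized, so the integer bit is 0 or 1 */
	quot[0] = (rem >= div);
	if (quot[0])
		rem -= div;

	for (i = 1; i < 4; i++) {
		/* rem < div here; compute floor(rem * 2^32 / div) bit by
		   bit so no intermediate ever needs more than 64 bits */
		digit = 0;
		for (bit = 0; bit < 32; bit++) {
			digit <<= 1;
			if (rem >= div - rem) {		/* i.e. 2*rem >= div */
				rem -= div - rem;	/* rem = 2*rem - div */
				digit |= 1;
			} else
				rem += rem;
		}
		quot[i] = (unsigned int) digit;
	}
}
#endif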

#if 0
static inline unsigned int fp_fls128(union fp_mant128 *src)
{
	unsigned long data;
	unsigned int res, off;

	if ((data = src->m32[0]))
		off = 0;
	else if ((data = src->m32[1]))
		off = 32;
	else if ((data = src->m32[2]))
		off = 64;
	else if ((data = src->m32[3]))
		off = 96;
	else
		return 128;

	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
	return res + off;
}

static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
	unsigned long sticky;

	switch (shift) {
	case 0:
		return;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (src->m32[3]) : "0" (src->m32[3]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[2]) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (src->m32[0]) : "0" (src->m32[0]));
		return;
	case 2 ... 31:
		src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
		src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[3] = (src->m32[3] << shift);
		return;
	case 32 ... 63:
		shift -= 32;
		src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
		src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[2] = (src->m32[3] << shift);
		src->m32[3] = 0;
		return;
	case 64 ... 95:
		shift -= 64;
		src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
		src->m32[1] = (src->m32[3] << shift);
		src->m32[2] = src->m32[3] = 0;
		return;
	case 96 ... 127:
		shift -= 96;
		src->m32[0] = (src->m32[3] << shift);
		src->m32[1] = src->m32[2] = src->m32[3] = 0;
		return;
	case -31 ... -1:
		shift = -shift;
		sticky = 0;
		if (src->m32[3] << (32 - shift))
			sticky = 1;
		src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
		src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[0] = (src->m32[0] >> shift);
		return;
	case -63 ... -32:
		shift = -shift - 32;
		sticky = 0;
		if ((src->m32[2] << (32 - shift)) || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
		src->m32[1] = (src->m32[0] >> shift);
		src->m32[0] = 0;
		return;
	case -95 ... -64:
		shift = -shift - 64;
		sticky = 0;
		if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
		src->m32[2] = (src->m32[0] >> shift);
		src->m32[1] = src->m32[0] = 0;
		return;
	case -127 ... -96:
		shift = -shift - 96;
		sticky = 0;
		if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
			sticky = 1;
		src->m32[3] = (src->m32[0] >> shift) | sticky;
		src->m32[2] = src->m32[1] = src->m32[0] = 0;
		return;
	}

	if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
		src->m32[3] = 1;
	else
		src->m32[3] = 0;
	src->m32[2] = 0;
	src->m32[1] = 0;
	src->m32[0] = 0;
}
#endif

static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
				 int shift)
{
	unsigned long tmp;

	switch (shift) {
	case 0:
		dest->mant.m64 = src->m64[0];
		dest->lowmant = src->m32[2] >> 24;
		if (src->m32[3] || (src->m32[2] << 8))
			dest->lowmant |= 1;
		break;
	case 1:
		asm volatile ("lsl.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] || (tmp << 8))
			dest->lowmant |= 1;
		break;
	case 31:
		asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
			: "=d" (dest->mant.m32[0])
			: "d" (src->m32[0]), "0" (src->m32[1]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[3]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] << 7)
			dest->lowmant |= 1;
		break;
	case 32:
		dest->mant.m32[0] = src->m32[1];
		dest->mant.m32[1] = src->m32[2];
		dest->lowmant = src->m32[3] >> 24;
		if (src->m32[3] << 8)
			dest->lowmant |= 1;
		break;
	}
}

#if 0 /* old code... */
static inline int fls(unsigned int a)
{
	int r;

	asm volatile ("bfffo %1{#0,#32},%0"
		      : "=d" (r) : "md" (a));
	return r;
}

/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
	if (a[MSW128])
		return fls(a[MSW128]);
	if (a[NMSW128])
		return fls(a[NMSW128]) + 32;
	/* XXX: it probably never gets beyond this point in actual
	   use, but that's indicative of a more general problem in the
	   algorithm (i.e. as per the actual 68881 implementation, we
	   really only need at most 67 bits of precision [plus
	   overflow]) so I'm not going to fix it. */
	if (a[NLSW128])
		return fls(a[NLSW128]) + 64;
	if (a[LSW128])
		return fls(a[LSW128]) + 96;
	else
		return -1;
}

static inline int zerop128(const int128 a)
{
	return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

static inline int nonzerop128(const int128 a)
{
	return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
   here */
static inline void add128(const int128 a, int128 b)
{
	/* rotating carry flags */
	unsigned int carry[2];

	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
	b[LSW128] += a[LSW128];

	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];

	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];

	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}

/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
	/* rotating borrow flags */
	unsigned int borrow[2];

	borrow[0] = b[LSW128] < a[LSW128];
	b[LSW128] -= a[LSW128];

	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];

	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];

	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}

/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
	unsigned long long acc;
	int128 acc128;

	zero128(acc128);
	zero128(c);

	/* first the low words */
	if (LO_WORD(a) && LO_WORD(b)) {
		acc = (long long) LO_WORD(a) * LO_WORD(b);
		c[NLSW128] = HI_WORD(acc);
		c[LSW128] = LO_WORD(acc);
	}
	/* Next the high words */
	if (HI_WORD(a) && HI_WORD(b)) {
		acc = (long long) HI_WORD(a) * HI_WORD(b);
		c[MSW128] = HI_WORD(acc);
		c[NMSW128] = LO_WORD(acc);
	}
	/* The middle words */
	if (LO_WORD(a) && HI_WORD(b)) {
		acc = (long long) LO_WORD(a) * HI_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
	/* The first and last words */
	if (HI_WORD(a) && LO_WORD(b)) {
		acc = (long long) HI_WORD(a) * LO_WORD(b);
		acc128[NMSW128] = HI_WORD(acc);
		acc128[NLSW128] = LO_WORD(acc);
		add128(acc128, c);
	}
}

/* Note: unsigned */
static inline int cmp128(int128 a, int128 b)
{
	if (a[MSW128] < b[MSW128])
		return -1;
	if (a[MSW128] > b[MSW128])
		return 1;
	if (a[NMSW128] < b[NMSW128])
		return -1;
	if (a[NMSW128] > b[NMSW128])
		return 1;
	if (a[NLSW128] < b[NLSW128])
		return -1;
	if (a[NLSW128] > b[NLSW128])
		return 1;

	return (signed) a[LSW128] - b[LSW128];
}

inline void div128(int128 a, int128 b, int128 c)
{
	int128 mask;

	/* Algorithm:

	   Shift the divisor until it's at least as big as the
	   dividend, keeping track of the position to which we've
	   shifted it, i.e. the power of 2 which we've multiplied it
	   by.

	   Then, for this power of 2 (the mask), and every one smaller
	   than it, subtract the raised divisor from the dividend and
	   add the mask to the quotient until the dividend is smaller
	   than the raised divisor.  At this point, divide the dividend
	   and the mask by 2 (i.e. shift one place to the right).
	   Lather, rinse, and repeat, until there are no more powers
	   of 2 left. */

	/* FIXME: needless to say, there's room for improvement here too. */
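
	/* Tiny worked example with small numbers, a = 13, b = 3:
	   shift up:    b: 3 -> 6 -> 12 -> 24,  mask: 1 -> 2 -> 4 -> 8
	   shift down:  b = 24: 13 < 24, skip;          mask -> 4, b -> 12
	                b = 12: 13 >= 12, a = 1, c = 4;  mask -> 2, b -> 6
	                b = 6:  1 < 6, skip;             mask -> 1, b -> 3
	                b = 3:  1 < 3, skip;             mask -> 0, stop
	   giving quotient c = 4 and remainder a = 1. */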

	/* Shift up */
	/* XXX: since it just has to be "at least as big", we can
	   probably eliminate this horribly wasteful loop.  I will
	   have to prove this first, though */
	set128(0, 0, 0, 1, mask);
	while (cmp128(b, a) < 0 && !btsthi128(b)) {
		lslone128(b);
		lslone128(mask);
	}

	/* Shift down */
	zero128(c);
	do {
		if (cmp128(a, b) >= 0) {
			sub128(b, a);
			add128(mask, c);
		}
		lsrone128(mask);
		lsrone128(b);
	} while (nonzerop128(mask));

	/* The remainder is in a... */
}
#endif

#endif	/* MULTI_ARITH_H */