Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
241818 views
1
#ifndef _ASM_INCLUDE_
2
#define _ASM_INCLUDE_
3
4
/*
5
Copyright 2007 Andrew V. Sutherland
6
7
This file is part of smalljac.
8
9
smalljac is free software: you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation, either version 2 of the License, or
12
(at your option) any later version.
13
14
smalljac is distributed in the hope that it will be useful,
15
but WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
GNU General Public License for more details.
18
19
You should have received a copy of the GNU General Public License
20
along with smalljac. If not, see <http://www.gnu.org/licenses/>.
21
*/
22
23
24
// This code could all be optimized - it is just a first stab
// Returns the bit index (0..63) of the highest set bit of x, via x86-64 "bsrq".
// NOTE(review): bsr leaves the destination undefined when the source is 0, so
// the result is unspecified for x == 0 — callers must pass x != 0 (TODO confirm).
static inline unsigned long _asm_highbit (unsigned long x) { asm ("bsrq %0, %0" : "=r" (x) : "0" (x)); return x; }
26
27
#define _asm_div_q_q(q,r,x,y) asm ("divq %4" :"=a"(q) ,"=d"(r) : "0"(x), "1"(r), "rm"(y))
28
#define _asm_mult_1_1(z1,z0,x0,y0) asm ("mulq %3" :"=a"(z0) ,"=d"(z1) : "a"(x0), "rm"(y0))
29
#define _asm_mult_2_2_1(z1,z0,x1,x0,y0) asm ("mulq %3" :"=a"(z0) ,"=d"(z1) : "a"(x0), "rm"(y0));(z1)+=(y0)*(x1)
30
#define _asm_addto_2_2(z1,z0,x1,x0) asm ("addq %3,%0;adcq %5,%1":"=r"(z0),"=r"(z1): "0"(z0), "rim"(x0), "1"(z1), "rim"(x1):"cc")
31
#define _asm_addto_2_1(z1,z0,x0) asm ("addq %3,%0;adcq $0,%1":"=r"(z0),"=r"(z1): "0"(z0), "rim"(x0), "1"(z1):"cc")
32
#define _asm_addto_3_3(z2,z1,z0,x2,x1,x0) asm ("addq %4,%0;adcq %6,%1;adcq %8,%2":"=r"(z0),"=r"(z1),"=r"(z2): "0"(z0), "rim"(x0), "1"(z1), "rim"(x1), "2"(z2), "rim"(x2) :"cc")
33
#define _asm_addto_3_2(z2,z1,z0,x1,x0) asm ("addq %4,%0;adcq %6,%1;adcq 0,%2":"=r"(z0),"=r"(z1),"=r"(z2): "0"(z0), "rim"(x0), "1"(z1), "rim"(x1), "2"(z2) :"cc")
34
#define _asm_subfrom_2_2(z1,z0,x1,x0) asm ("subq %3,%0;sbbq %5,%1":"=r"(z0),"=r"(z1): "0"(z0), "rim"(x0), "1"(z1), "rim"(x1):"cc")
35
// increment needs to propagate the carry - performance not critical here anyway
// (z1,z0) += 1, with the carry from the low limb rippling into z1.
#define _asm_inc_2(z1,z0) asm ("addq $1,%0;adcq $0,%1":"=r"(z0),"=r"(z1): "0"(z0), "1"(z1):"cc")
// NOTE(review): the two disabled shift macros below look wrong as written —
// AT&T syntax puts the shift count first ("shlq $1,%0", or count-less "shlq %0"),
// not "shlq %0,1". Verify the operand order before re-enabling either of them.
//#define _asm_shiftl_2(z1,z0) asm ("shlq %0,1;rclq %1,1":"=r"(z0),"=r"(z1): "0"(z0), "1"(z1):"cc")
//#define _asm_shiftr_2(z1,z0) asm ("shrq %1,1;rcrq %0,1":"=r"(z0),"=r"(z1): "0"(z0), "1"(z1):"cc")
39
40
41
// (z2,z1,z0) = (x1,x0) * y0: two single-limb multiplies, then the high limb of
// the low product is carried into the upper pair.
// Wrapped in do { } while (0) so the macro acts as a single statement and is
// safe inside an unbraced if/else (the original bare { } block followed by the
// caller's ";" broke that).  The temporary is renamed from "__u": identifiers
// containing a double underscore are reserved for the implementation.
#define _asm_mult_3_2_1(z2,z1,z0,x1,x0,y0) do { unsigned long _mu; \
	_asm_mult_1_1 (_mu,z0,x0,y0); \
	_asm_mult_1_1 (z2,z1,x1,y0); \
	_asm_addto_2_1 (z2,z1,_mu); } while (0)
45
46
// Computes the 3-limb product z = x * y, where x and y are 2-limb little-endian
// values (value = w[1]*2^64 + w[0]).
// Precondition (from the original comment): x[1] and y[1] < 2^31, so the plain
// 64-bit multiply x[1]*y[1] plus the accumulated carry R1 fits in the top limb.
// Returns z[2], the high limb.  (The original declared a non-void return type
// but fell off the end — undefined behavior if the caller used the result.)
static inline unsigned long _asm_mult_3_2_2 (unsigned long z[3], unsigned long x[2], unsigned long y[2])
{
	unsigned long U, V, R0, R1;	// the original's R2 was never used

	R1 = 0;
	_asm_mult_1_1(R0,z[0],x[0],y[0]);	// (R0, z[0]) = x0*y0
	_asm_mult_1_1(U,V,x[0],y[1]);		// (U, V) = x0*y1
	_asm_addto_2_2(R1,R0,U,V);		// fold in first cross product
	_asm_mult_1_1(U,V,x[1],y[0]);		// (U, V) = x1*y0
	_asm_addto_2_2(R1,R0,U,V);		// fold in second cross product
	z[1] = R0;
	z[2] = x[1]*y[1]+R1;			// top limb; no overflow by precondition
	return z[2];
}
60
61
// This function assumes that x[1] and y[1] < 2^31
// NOTE(review): z2, z1 and z0 are received BY VALUE, so every store below —
// including the macro's write into z0 — is discarded when the function returns,
// and the function also falls off the end despite its unsigned long return type
// (undefined behavior if the caller uses the result).  As written this routine
// has no observable effect; it presumably needs pointer/reference-style outputs
// like _asm_mult_3_2_2's z[3] array.  TODO confirm intent against callers.
static inline unsigned long _asm_mult_3_2_2r (unsigned long z2, unsigned long z1, unsigned long z0, unsigned long x[2], unsigned long y[2])
{
register unsigned long U, V, R0,R1, R2;	// NOTE(review): R2 is never used

R1 = 0;
_asm_mult_1_1(R0,z0,x[0],y[0]);
_asm_mult_1_1(U,V,x[0],y[1]);
_asm_addto_2_2(R1,R0,U,V);
_asm_mult_1_1(U,V,x[1],y[0]);
_asm_addto_2_2(R1,R0,U,V);
z1 = R0;
z2 = x[1]*y[1]+R1;
}
75
76
77
// This function assumes that x[1] < 2^31. For no obvious reason, this is slower than multiplying?!
// Computes the 3-limb square z = x^2 for a 2-limb little-endian x
// (value = x[1]*2^64 + x[0]).  The cross product x0*x1 is computed once and
// added in twice; the x[1] < 2^31 precondition keeps the top limb from
// overflowing.
// Returns z[2], the high limb.  (The original declared a non-void return type
// but fell off the end — undefined behavior if the caller used the result.)
static inline unsigned long _asm_square_3_2 (unsigned long z[3], unsigned long x[2])
{
	unsigned long U, V, R0, R1;	// the original's R2 was never used

	R1 = 0;
	_asm_mult_1_1(R0,z[0],x[0],x[0]);	// (R0, z[0]) = x0^2
	_asm_mult_1_1(U,V,x[0],x[1]);		// (U, V) = x0*x1
	_asm_addto_2_2(R1,R0,U,V);		// add cross product once...
	_asm_addto_2_2(R1,R0,U,V);		// ...and again (doubles it)
	z[1] = R0;
	z[2] = x[1]*x[1]+R1;			// top limb; no overflow by precondition
	return z[2];
}
90
#endif
91
92