// Origin: GitHub repository PojavLauncherTeam/mesa
// Path:   blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/lib/gf100.asm
.section #gf100_builtin_code
// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
// NOTE(review): the routine below is 26 instructions long and the only
// predicate it writes is $p0, so the SIZE figure and the $p1 clobber entry
// look stale / conservative - confirm before relying on them.
// A divisor of 0 is not special-cased anywhere in this routine.
//
gf100_div_u32:
   // Seed for z: 1 << ($r2 ^ 0x1f), where $r2 comes from bfind on the
   // divisor (presumably the index of its highest set bit, so the xor
   // yields 31 - index - confirm against the ISA documentation).
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // From here on $r1 holds -b, so products with it give -(b * x).
   cvt u32 $r1 neg u32 $r1
   // Five identical refinement steps:
   //   e = (-b) * z          (kept in $r3)
   //   z = high(z * e) + z   (kept in $r2)
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Quotient estimate: keep a in $r3, q = high(a * z) in $r0.
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // Undo the earlier negation: $r2 = b.
   cvt u32 $r2 neg u32 $r1
   // Remainder estimate: $r1 = (-b) * q + a.
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // First correction: if remainder >= b, take one more b out of it.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Second correction: the compare is itself predicated on $p0, so it is
   // only re-evaluated when the first correction was applied.
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   ret
// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
// Sign handling as implemented below:
//   $p2 = dividend < 0                 -> modulus is negated at the end
//   $p3 = (divisor < 0) xor $p2        -> result is negated at the end
//
// NOTE(review): $p1 is listed as clobbered but is never written by this
// routine; the entry looks conservative - confirm before relying on it.
// A divisor of 0 is not special-cased anywhere in this routine.
//
gf100_div_s32:
   // Record the signs before the operands are replaced by their magnitudes.
   set $p2 0x1 lt s32 $r0 0x0
   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   cvt s32 $r0 abs s32 $r0
   cvt s32 $r1 abs s32 $r1
   // The following instructions, up to the sign fix-up, are the same
   // sequence as gf100_div_u32, applied to |dividend| and |divisor|.
   // Seed for z: 1 << ($r2 ^ 0x1f), with $r2 produced by bfind on |divisor|.
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // From here on $r1 holds -b.
   cvt u32 $r1 neg u32 $r1
   // Five identical refinement steps:
   //   e = (-b) * z          (kept in $r3)
   //   z = high(z * e) + z   (kept in $r2)
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Quotient estimate: keep a in $r3, q = high(a * z) in $r0.
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // Undo the earlier negation: $r2 = b.
   cvt u32 $r2 neg u32 $r1
   // Remainder estimate: $r1 = (-b) * q + a.
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // First correction: if remainder >= b, take one more b out of it.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Second correction, only re-evaluated when the first one was applied.
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Re-apply the signs recorded on entry.
   $p3 cvt s32 $r0 neg s32 $r0
   $p2 cvt s32 $r1 neg s32 $r1
   ret
// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r7
// SIZE:    9 * 8 bytes
//
// NOTE(review): the header above describes a Newton-Raphson routine, but
// the body here is only a placeholder (nop + ret): it contains no
// arithmetic and writes no register, so $r0d is handed back as it came in
// and the CLOBBER / SIZE entries do not describe this listing. Presumably
// f64 reciprocal is handled elsewhere for this target - confirm.
//
gf100_rcp_f64:
   nop
   ret
// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE:    14 * 8 bytes
//
// NOTE(review): the header above describes a Newton-Raphson routine, but
// the body here is only a placeholder (nop + ret): it contains no
// arithmetic and writes no register, so $r0d is handed back as it came in
// and the CLOBBER / SIZE entries do not describe this listing. Presumably
// f64 reciprocal square root is handled elsewhere for this target - confirm.
//
gf100_rsq_f64:
   nop
   ret
// Builtin entry table: one 64-bit reference per routine, listed in the
// order div_u32, div_s32, rcp_f64, rsq_f64.
// NOTE(review): the consumer presumably looks entries up by position, so
// the order here would have to match its indexing - confirm before
// reordering or inserting entries.
.section #gf100_builtin_offsets
.b64 #gf100_div_u32
.b64 #gf100_div_s32
.b64 #gf100_rcp_f64
.b64 #gf100_rsq_f64