Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/lib/gf100.asm
4574 views
.section #gf100_builtin_code
// DIV U32
//
// Unsigned 32-bit division by reciprocal estimation.
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
// NOTE(review): the routine below lists 26 instructions and only ever
// writes $p0; the SIZE figure and the $p1 clobber look like they were
// inherited from another revision - confirm before relying on them.
//
gf100_div_u32:
   // Initial estimate z0 = 1 << (msb(b) ^ 31).
   // Presumably bfind yields the index of the most significant set bit,
   // so the xor with 0x1f turns it into a leading-zero count - confirm
   // against the ISA documentation.
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b, kept negated for the rest of the refinement.
   cvt u32 $r1 neg u32 $r1
   // Five refinement steps, each: e = lo32(-b * z); z += hi32(z * e).
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Save the dividend, then q = hi32(a * z).
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = b again (negating the already negated divisor).
   cvt u32 $r2 neg u32 $r1
   // $r1 = a + (-b) * q, i.e. the remainder for the estimated quotient.
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // First correction: while the remainder is still >= b, take one more b
   // out of it and bump the quotient.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Second correction, only evaluated when the first one fired.
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   ret

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// The sign of the quotient is (sign(dividend) xor sign(divisor)); the
// modulus takes the sign of the dividend.
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
// NOTE(review): $p1 is listed as clobbered but is not written by the
// code below.
//
gf100_div_s32:
   // $p2 = dividend is negative; $p3 = operand signs differ.
   set $p2 0x1 lt s32 $r0 0x0
   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   // Work on magnitudes from here on.
   cvt s32 $r0 abs s32 $r0
   cvt s32 $r1 abs s32 $r1
   // Same instruction sequence as gf100_div_u32 from this point to the
   // sign fix-up: initial estimate z0 = 1 << (msb(b) ^ 31).
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b, kept negated for the rest of the refinement.
   cvt u32 $r1 neg u32 $r1
   // Five refinement steps, each: e = lo32(-b * z); z += hi32(z * e).
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Save |dividend|, then q = hi32(|a| * z).
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = |b| again (negating the already negated divisor).
   cvt u32 $r2 neg u32 $r1
   // $r1 = |a| + (-|b|) * q, the remainder for the estimated quotient.
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // Up to two corrections of quotient and remainder, the second one
   // only evaluated when the first one fired.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Restore signs: negate the quotient when the operand signs differed,
   // negate the modulus when the dividend was negative.
   $p3 cvt s32 $r0 neg s32 $r0
   $p2 cvt s32 $r1 neg s32 $r1
   ret

// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r7
// SIZE:    9 * 8 bytes
//
// NOTE(review): the body below is only a placeholder (nop + ret); it
// performs no computation on $r0d and writes no register. The CLOBBER
// and SIZE figures above describe the intended Newton-Raphson routine,
// not this stub.
//
gf100_rcp_f64:
   nop
   ret

// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE:    14 * 8 bytes
//
// NOTE(review): the body below is only a placeholder (nop + ret); it
// performs no computation on $r0d and writes no register. The CLOBBER
// and SIZE figures above describe the intended Newton-Raphson routine,
// not this stub.
//
gf100_rsq_f64:
   nop
   ret

// Table with one 64-bit entry per builtin, in the order
// div_u32, div_s32, rcp_f64, rsq_f64.
// NOTE(review): presumably consumers look entries up by position, so
// keep the order stable - confirm against the code that loads this table.
.section #gf100_builtin_offsets
.b64 #gf100_div_u32
.b64 #gf100_div_s32
.b64 #gf100_rcp_f64
.b64 #gf100_rsq_f64