// Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
// 4574 views
.section #gk104_builtin_code
// Builtin routine library: integer division, surface-load format helpers,
// f64 reciprocal / reciprocal square root, and a trap handler.
// The routines whose labels are listed in #gk104_builtin_offsets at the end
// of this file are gk104_div_u32, gk104_div_s32, gk104_rcp_f64 and
// gk104_rsq_f64.

// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
gk104_div_u32:
   sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28
   // Initial estimate of z in $r2: 1 shifted left by (bfind(b) xor 0x1f)
   bfind u32 $r2 $r1
   long xor b32 $r2 $r2 0x1f
   long mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b, so that each refinement below is z += mulhi(z, -b * z)
   long cvt u32 $r1 neg u32 $r1
   // Refinement 1 of 5
   long mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   // Refinement 2 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 3 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 4 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 5 of 5 (its add follows the sched word)
   mul $r3 u32 $r1 u32 $r2
   sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // $r3 = a, $r0 = mulhi(a, z) = quotient estimate
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = b again (negating -b), $r1 = a - b * q = remainder estimate
   long cvt u32 $r2 neg u32 $r1
   long add $r1 (mul u32 $r1 u32 $r0) $r3
   // First correction: while remainder >= b, subtract b and bump quotient
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
   $p0 add b32 $r0 $r0 0x1
   // Second correction, only evaluated if the first one was taken
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   long ret

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
gk104_div_s32:
   // $p2 = dividend is negative; $p3 = (divisor is negative) xor $p2,
   // i.e. the operand signs differ
   set $p2 0x1 lt s32 $r0 0x0
   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
   long cvt s32 $r0 abs s32 $r0
   long cvt s32 $r1 abs s32 $r1
   // From here on: same sequence as gk104_div_u32 on the absolute values
   bfind u32 $r2 $r1
   long xor b32 $r2 $r2 0x1f
   long mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   cvt u32 $r1 neg u32 $r1
   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   // Refinement 1 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 2 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 3 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 4 of 5 (its add follows the sched word)
   mul $r3 u32 $r1 u32 $r2
   sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Refinement 5 of 5
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Quotient and remainder estimates, as in gk104_div_u32
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   long cvt u32 $r2 neg u32 $r1
   long add $r1 (mul u32 $r1 u32 $r0) $r3
   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
   // Up to two corrections of quotient / remainder
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   long $p0 add b32 $r0 $r0 0x1
   // Restore signs: quotient is negated when the operand signs differed,
   // modulus is negated when the dividend was negative
   long $p3 cvt s32 $r0 neg s32 $r0
   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
   $p2 cvt s32 $r1 neg s32 $r1
   long ret

// SULDP [for each format]
// $r4d: address
// $r2: surface info (format)
// $p0: access predicate
// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg)
//
// Each routine below issues the same three predicated raw loads (one per
// caching mode) and then expands the loaded bits into $r0..$r3.
// None of these routines has a label or an entry in the
// #gk104_builtin_offsets table at the end of this file.
// NOTE(review): the mul constants are presumably the per-format
// normalization factors (e.g. ~1/255 for 8-bit UNORM) -- verify before
// relying on them.
//
// RGBA32
   $p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
   long ret
// RGBA16_UNORM
// A texel is 4 x 16 bits, so this is a 64-bit load into $r0d like the other
// RGBA16 variants (this was a 128-bit load into $r0q, matching RGBA32
// rather than its siblings).
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   cvt rn f32 $r3 u16 1 $r1
   cvt rn f32 $r2 u16 0 $r1
   mul f32 $r3 $r3 0x37800074
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt rn f32 $r1 u16 1 $r0
   mul f32 $r2 $r2 0x37800074
   cvt rn f32 $r0 u16 0 $r0
   mul f32 $r1 $r1 0x37800074
   mul f32 $r0 $r0 0x37800074
   long ret
// RGBA16_SNORM
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   cvt rn f32 $r3 s16 1 $r1
   cvt rn f32 $r2 s16 0 $r1
   mul f32 $r3 $r3 0x38000187
   cvt rn f32 $r1 s16 1 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r2 $r2 0x38000187
   cvt rn f32 $r0 s16 0 $r0
   mul f32 $r1 $r1 0x38000187
   mul f32 $r0 $r0 0x38000187
   long ret
// RGBA16_SINT
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   cvt s32 $r3 s16 1 $r1
   cvt s32 $r2 s16 0 $r1
   cvt s32 $r1 s16 1 $r0
   cvt s32 $r0 s16 0 $r0
   long ret
// RGBA16_UINT
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   cvt u32 $r3 u16 1 $r1
   cvt u32 $r2 u16 0 $r1
   cvt u32 $r1 u16 1 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt u32 $r0 u16 0 $r0
   long ret
// RGBA16_FLOAT
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   cvt f32 $r3 f16 $r1 1
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt f32 $r2 f16 $r1 0
   cvt f32 $r1 f16 $r0 1
   cvt f32 $r0 f16 $r0 0
   long ret
// RG32_FLOAT
// Missing components are filled with 0 ($r2) and 1.0f ($r3 = 0x3f800000)
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r2 0x00000000
   long mov b32 $r3 0x3f800000
   long ret
// RG32_xINT
// Missing components are filled with 0 ($r2) and integer 1 ($r3)
   $p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r2 0x00000000
   long mov b32 $r3 0x00000001
   long ret
// RGB10A2_UNORM
// Only the three 10-bit fields are extracted; $r3 is set to constant 1.0f
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   ext u32 $r1 $r0 0x0a0a
   long mov b32 $r3 0x3f800000
   ext u32 $r2 $r0 0x0a14
   long and b32 $r0 $r0 0x3ff
   cvt rn f32 $r2 u16 0 $r2
   cvt rn f32 $r1 u16 0 $r1
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r2 $r2 0x3a802007
   cvt rn f32 $r0 u16 0 $r0
   mul f32 $r1 $r1 0x3a802007
   mul f32 $r0 $r0 0x3a802007
   long ret
// RGB10A2_UINT
// Only the three 10-bit fields are extracted; $r3 is set to constant 1
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   ext u32 $r1 $r0 0x0a0a
   long mov b32 $r3 0x00000001
   ext u32 $r2 $r0 0x0a14
   long and b32 $r0 $r0 0x3ff
   long ret
// RGBA8_UNORM
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   cvt rn f32 $r3 u8 3 $r0
   cvt rn f32 $r2 u8 2 $r0
   mul f32 $r3 $r3 0x3b808081
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt rn f32 $r1 u8 1 $r0
   mul f32 $r2 $r2 0x3b808081
   cvt rn f32 $r0 u8 0 $r0
   mul f32 $r1 $r1 0x3b808081
   mul f32 $r0 $r0 0x3b808081
   long ret
// RGBA8_SNORM
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   cvt rn f32 $r3 s8 3 $r0
   cvt rn f32 $r2 s8 2 $r0
   mul f32 $r3 $r3 0x3c010204
   cvt rn f32 $r1 s8 1 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r2 $r2 0x3c010204
   cvt rn f32 $r0 s8 0 $r0
   mul f32 $r1 $r1 0x3c010204
   mul f32 $r0 $r0 0x3c010204
   long ret
// RGBA8_SINT
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   cvt s32 $r3 s8 3 $r0
   cvt s32 $r2 s8 2 $r0
   cvt s32 $r1 s8 1 $r0
   cvt s32 $r0 s8 0 $r0
   long ret
// RGBA8_UINT
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   cvt u32 $r3 u8 3 $r0
   cvt u32 $r2 u8 2 $r0
   cvt u32 $r1 u8 1 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt u32 $r0 u8 0 $r0
   long ret
// R5G6B5_UNORM
// The 6-bit middle field uses a different scale (0x3c820821) than the two
// 5-bit fields (0x3d042108)
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   ext u32 $r1 $r0 0x0605
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long mov b32 $r3 0x3f800000
   ext u32 $r2 $r0 0x050b
   long and b32 $r0 $r0 0x1f
   cvt rn f32 $r2 u8 0 $r2
   cvt rn f32 $r1 u8 0 $r1
   mul f32 $r2 $r2 0x3d042108
   cvt rn f32 $r0 u8 0 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r1 $r1 0x3c820821
   mul f32 $r0 $r0 0x3d042108
   long ret
// R5G5B5X1_UNORM
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   ext u32 $r1 $r0 0x0505
   ext u32 $r2 $r0 0x050a
   long and b32 $r0 $r0 0x1f
   long mov b32 $r3 0x3f800000
   cvt rn f32 $r2 u8 0 $r2
   cvt rn f32 $r1 u8 0 $r1
   cvt rn f32 $r0 u8 0 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r2 $r2 0x3d042108
   mul f32 $r1 $r1 0x3d042108
   mul f32 $r0 $r0 0x3d042108
   long ret
// RG16_UNORM
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   cvt rn f32 $r1 u16 1 $r0
   cvt rn f32 $r0 u16 0 $r0
   mul f32 $r1 $r1 0x37800074
   mul f32 $r0 $r0 0x37800074
   long mov b32 $r2 0x00000000
   long mov b32 $r3 0x3f800000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long ret
// RG16_SNORM
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x3f800000
   cvt rn f32 $r1 s16 1 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mov b32 $r2 0x00000000
   cvt rn f32 $r0 s16 0 $r0
   mul f32 $r1 $r1 0x38000187
   mul f32 $r0 $r0 0x38000187
   long ret
// RG16_SINT
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x00000001
   cvt s32 $r1 s16 1 $r0
   mov b32 $r2 0x00000000
   cvt s32 $r0 s16 0 $r0
   long ret
// RG16_UINT
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x00000001
   cvt u32 $r1 u16 1 $r0
   mov b32 $r2 0x00000000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt u32 $r0 u16 0 $r0
   long ret
// RG16_FLOAT
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x3f800000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt f32 $r1 f16 $r0 1
   mov b32 $r2 0x00000000
   cvt f32 $r0 f16 $r0 0
   long ret
// R32_FLOAT
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x3f800000
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   long ret
// R32_xINT
   $p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x00000001
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   long ret
// RG8_UNORM
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x3f800000
   cvt rn f32 $r1 u8 1 $r0
   mov b32 $r2 0x00000000
   cvt rn f32 $r0 u8 0 $r0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r1 $r1 0x3b808081
   mul f32 $r0 $r0 0x3b808081
   long ret
// RG8_SNORM
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long mov b32 $r3 0x3f800000
   cvt rn f32 $r1 s8 1 $r0
   long mov b32 $r2 0x00000000
   cvt rn f32 $r0 s8 0 $r0
   mul f32 $r1 $r1 0x3c010204
   mul f32 $r0 $r0 0x3c010204
   long ret
// RG8_UINT
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x00000001
   cvt u32 $r1 u8 1 $r0
   long mov b32 $r2 0x00000000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt u32 $r0 u8 0 $r0
   long ret
// RG8_SINT
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x00000001
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   cvt s32 $r1 s8 1 $r0
   long mov b32 $r2 0x00000000
   cvt s32 $r0 s8 0 $r0
   long ret
// R16_UNORM
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x3f800000
   cvt rn f32 $r0 u16 0 $r0
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   mul f32 $r0 $r0 0x37800074
   long ret
// R16_SNORM
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x3f800000
   cvt rn f32 $r0 s16 0 $r0
   long mov b32 $r2 0x00000000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long mov b32 $r1 0x00000000
   mul f32 $r0 $r0 0x38000187
   long ret
// R16_SINT
   $p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long mov b32 $r3 0x00000001
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   long ret
// R16_UINT
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x00000001
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   long ret
// R16_FLOAT
   $p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x3f800000
   long mov b32 $r2 0x00000000
   cvt f32 $r0 f16 $r0 0
   mov b32 $r1 0x00000000
   long ret
// R8_UNORM
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
   mov b32 $r3 0x3f800000
   cvt rn f32 $r0 u8 0 $r0
   mov b32 $r2 0x00000000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mul f32 $r0 $r0 0x3b808081
   mov b32 $r1 0x00000000
   long ret
// R8_SNORM
   $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   mov b32 $r3 0x3f800000
   cvt rn f32 $r0 s8 0 $r0
   mov b32 $r2 0x00000000
   mul f32 $r0 $r0 0x3c010204
   mov b32 $r1 0x00000000
   long ret
// R8_SINT
   $p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x00000001
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   long ret
// R8_UINT
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   $p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x00000001
   long mov b32 $r2 0x00000000
   long mov b32 $r1 0x00000000
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long ret
// R11G11B10_FLOAT TODO
// Not implemented: the loaded word lands in $r3 and is then overwritten
// with 1.0f; $r0 - $r2 are not written by this routine.
   $p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0
   set $p1 0x1 $p1 xor not $p2
   $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
   $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
   long mov b32 $r3 0x3f800000
   long nop
   sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
   long nop
   long ret


// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r9, $p0
//          ($r8d holds the constant 1.0 during the Newton-Raphson steps)
// SIZE:    NOTE(review): the "9 * 8 bytes" figure recorded here does not
//          match the number of instructions below -- TODO recount
//
gk104_rcp_f64:
   // Step 1: classify input according to exponent and value, and calculate
   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
   // bit 52 (bit 20 of the upper half) and is 11 bits in length
   ext u32 $r2 $r1 0xb14
   add b32 $r3 $r2 0xffffffff
   joinat #rcp_rejoin
   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
   // mean that it's > 0x7fd in those cases when doing unsigned comparison
   set $p0 0x1 gt u32 $r3 0x7fd
   // $r3: 0 for norms, 0x36 for denorms, -1 for others
   long mov b32 $r3 0x0
   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
   join (not $p0) nop
   // Process all special values: NaN, inf, denorm, 0
   mov b32 $r3 0xffffffff
   // A number is NaN if its abs value is greater than or unordered with inf
   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
   (not $p0) bra #rcp_inf_or_denorm_or_zero
   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
   // behavior is both seen on the CPU and the blob
   join or b32 $r1 $r1 0x80000
rcp_inf_or_denorm_or_zero:
   and b32 $r4 $r1 0x7ff00000
   // Other values with nonzero in exponent field should be inf
   set $p0 0x1 eq s32 $r4 0x0
   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
   $p0 bra #rcp_denorm_or_zero
   // +/-Inf -> +/-0
   xor b32 $r1 $r1 0x7ff00000
   join mov b32 $r0 0x0
rcp_denorm_or_zero:
   set $p0 0x1 gtu f64 abs $r0d 0x0
   $p0 bra #rcp_denorm
   // +/-0 -> +/-Inf
   join or b32 $r1 $r1 0x7ff00000
rcp_denorm:
   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
   mul rn f64 $r0d $r0d 0x4350000000000000
   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
   join mov b32 $r3 0x36
rcp_rejoin:
   // All numbers with -1 in $r3 have their result ready in $r0d, return them
   // others need further calculation
   set $p0 0x1 lt s32 $r3 0x0
   $p0 bra #rcp_end
   // Step 2: Before the real calculation goes on, renormalize the values to
   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
   // result in $r6d. The exponent will be recovered later.
   ext u32 $r2 $r1 0xb14
   and b32 $r7 $r1 0x800fffff
   add b32 $r7 $r7 0x3ff00000
   long mov b32 $r6 $r0
   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
   // Step 3: Convert new value to float (no overflow will occur due to step
   // 2), calculate rcp and do newton-raphson step once
   cvt rz f32 $r5 f64 $r6d
   long rcp f32 $r4 $r5
   mov b32 $r0 0xbf800000
   fma rn f32 $r5 $r4 $r5 $r0
   fma rn f32 $r0 neg $r4 $r5 $r4
   // Step 4: convert result $r0 back to double, do newton-raphson steps
   cvt f64 $r0d f32 $r0
   cvt f64 $r6d neg f64 $r6d
   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
   // $r8d = 1.0 (0x3f800000 is 1.0f)
   cvt f64 $r8d f32 0x3f800000
   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
   // The formula used here (and above) is:
   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
   // The following code uses 2 FMAs for each step, and each step basically
   // looks like:
   //     tmp = -src * RCP_{n} + 1
   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   // Step 5: Exponent recovery and final processing
   // The exponent is recovered by adding what we added to the exponent.
   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
   //     rcp(x) = c * rcp(cx)
   // The delta in exponent comes from two sources:
   //   1) The renormalization in step 2. The delta is:
   //      0x3ff - $r2
   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
   //      in $r3
   // These 2 sources are calculated in the first two lines below, and then
   // added to the exponent extracted from the result above.
   // Note that after processing, the new exponent may be >= 0x7ff (inf)
   // or <= 0 (denorm). Those cases will be handled respectively below
   subr b32 $r2 $r2 0x3ff
   long add b32 $r4 $r2 $r3
   ext u32 $r3 $r1 0xb14
   // New exponent in $r3
   long add b32 $r3 $r3 $r4
   add b32 $r2 $r3 0xffffffff
   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
   // (same logic as in step 1)
   set $p0 0x1 lt u32 $r2 0x7fe
   (not $p0) bra #rcp_result_inf_or_denorm
   // Norms: convert exponents back and return
   shl b32 $r4 $r4 clamp 0x14
   long add b32 $r1 $r4 $r1
   bra #rcp_end
rcp_result_inf_or_denorm:
   // New exponent >= 0x7ff means that result is inf
   set $p0 0x1 ge s32 $r3 0x7ff
   (not $p0) bra #rcp_result_denorm
   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
   // Infinity
   and b32 $r1 $r1 0x80000000
   long mov b32 $r0 0x0
   add b32 $r1 $r1 0x7ff00000
   bra #rcp_end
rcp_result_denorm:
   // Denorm result comes from huge input. The greatest possible fp64, i.e.
   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
   // normal value. Other rcp result should be greater than that. If we
   // set the exponent field to 1, we can recover the result by multiplying
   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
   // the logic here.
   set $p0 0x1 ne u32 $r3 0x0
   and b32 $r1 $r1 0x800fffff
   // 0x3e800000: 1/4
   $p0 cvt f64 $r6d f32 0x3e800000
   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
   // 0x3f000000: 1/2
   (not $p0) cvt f64 $r6d f32 0x3f000000
   add b32 $r1 $r1 0x00100000
   mul rn f64 $r0d $r0d $r6d
rcp_end:
   long ret

// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r9, $p0 - $p1
//          ($r8d holds the constant 0.5 during the Newton-Raphson steps)
// SIZE:    NOTE(review): the "14 * 8 bytes" figure recorded here does not
//          match the number of instructions below -- TODO recount
//
gk104_rsq_f64:
   // Before getting initial result rsqrt64h, two special cases should be
   // handled first.
   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
   //    as NaN in rsqrt64h
   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
   $p0 or b32 $r1 $r1 0x00080000
   and b32 $r2 $r1 0x7fffffff
   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
   // 2. denorms and small normal values: using their original value will
   //    lose precision either at rsqrt64h or the first step in newton-raphson
   //    steps below. Take 2 as a threshold in exponent field, and multiply
   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
   //    to recover in the end)
   ext u32 $r3 $r1 0xb14
   set $p1 0x1 le u32 $r3 0x2
   // $r2 = low word | (high word without sign): zero only for +/-0
   long or b32 $r2 $r0 $r2
   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
   rsqrt64h $r5 $r1
   // rsqrt64h will give correct result for 0/inf/nan, the following logic
   // checks whether the input is one of those (exponent is 0x7ff or all 0
   // except for the sign bit)
   set b32 $r6 ne u32 $r3 0x7ff
   long and b32 $r2 $r2 $r6
   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
   set $p0 0x1 ne u32 $r2 0x0
   $p0 bra #rsq_norm
   // For 0/inf/nan, make sure the sign bit agrees with input and return
   and b32 $r1 $r1 0x80000000
   long mov b32 $r0 0x0
   long or b32 $r1 $r1 $r5
   long ret
rsq_norm:
   // For others, do 4 Newton-Raphson steps with the formula:
   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
   // In the code below, each step is written as:
   //     tmp1 = 0.5 * x * RSQ_{n}
   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
   // $r4d is the running estimate: high word $r5 from rsqrt64h, low word 0
   long mov b32 $r4 0x0
   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
   // 0x3f000000: 1/2
   cvt f64 $r8d f32 0x3f000000
   // $r2d = 0.5 * x, reused by every step
   mul rn f64 $r2d $r0d $r8d
   // Step 1 of 4
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   fma rn f64 $r4d $r4d $r6d $r4d
   // Step 2 of 4 (its last fma follows the sched word)
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
   fma rn f64 $r4d $r4d $r6d $r4d
   // Step 3 of 4
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   fma rn f64 $r4d $r4d $r6d $r4d
   // Step 4 of 4
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   fma rn f64 $r4d $r4d $r6d $r4d
   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
   // Multiply 2^27 to result for small inputs to recover
   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
   long mov b32 $r1 $r5
   long mov b32 $r0 $r4
   long ret

//
// Trap handler.
// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
// Low 32 bytes of l[] memory shouldn't be used if resumability is required.
//
// Trap info:
// 0x000: mutex
// 0x004: PC
// 0x008: trapstat
// 0x00c: warperr
// 0x010: tidx
// 0x014: tidy
// 0x018: tidz
// 0x01c: ctaidx
// 0x020: ctaidy
// 0x024: ctaidz
// 0x030: $r0q
// 0x130: $flags
// 0x140: s[]
//
// NOTE(review): unlike the routines above, this handler contains no sched
// control words, has no label and is not listed in #gk104_builtin_offsets --
// confirm whether it is meant to be assembled/used as-is for this chipset.
//
   st b128 wb l[0x00] $r0q
   // check state of the warp and continue if it didn't cause the trap
   long mov b32 $r1 $trapstat
   long mov b32 $r3 $warperr
   mov $r2 $flags mask 0xffff
   and b32 0 $c $r1 $r3
   e $c bra #end_cont
   // spill control flow stack to l[]
   long mov b32 $r3 16
spill_cfstack:
   preret #end_exit
   sub b32 $r3 $c $r3 0x1
   lg $c bra #spill_cfstack
   // retrieve pointer to trap info
   mov b32 $r0 c0[0x1900]
   mov b32 $r1 c0[0x1904]
   // we only let a single faulting thread store its state
   mov b32 $r3 0x1
   exch b32 $r3 g[$r0d] $r3
   joinat #end_exit
   set $p0 0x1 eq u32 $r3 0x1
   join $p0 nop
   // store $c and $p registers
   st b32 wb g[$r0d+0x130] $r2
   // store $trapstat and $warperr
   long mov b32 $r2 $trapstat
   long mov b32 $r3 $warperr
   st b64 wb g[$r0d+0x8] $r2d
   // store registers
   st b128 wb g[$r0d+0x40] $r4q
   st b128 wb g[$r0d+0x50] $r8q
   st b128 wb g[$r0d+0x60] $r12q
   st b128 wb g[$r0d+0x70] $r16q
   st b128 wb g[$r0d+0x80] $r20q
   st b128 wb g[$r0d+0x90] $r24q
   st b128 wb g[$r0d+0xa0] $r28q
   st b128 wb g[$r0d+0xb0] $r32q
   st b128 wb g[$r0d+0xc0] $r36q
   st b128 wb g[$r0d+0xd0] $r40q
   st b128 wb g[$r0d+0xe0] $r44q
   st b128 wb g[$r0d+0xf0] $r48q
   st b128 wb g[$r0d+0x100] $r52q
   st b128 wb g[$r0d+0x110] $r56q
   st b128 wb g[$r0d+0x120] $r60q
   // $r0q was saved to l[0x00] on entry; copy it out 64 bits at a time
   ld b64 $r2d cs l[0x0]
   st b64 wb g[$r0d+0x30] $r2d
   ld b64 $r2d cs l[0x8]
   st b64 wb g[$r0d+0x38] $r2d
   // store thread id
   long mov b32 $r2 $tidx
   long mov b32 $r3 $tidy
   st b64 wb g[$r0d+0x10] $r2d
   long mov b32 $r2 $tidz
   long mov b32 $r3 $ctaidx
   st b64 wb g[$r0d+0x18] $r2d
   long mov b32 $r2 $ctaidy
   long mov b32 $r3 $ctaidz
   st b64 wb g[$r0d+0x20] $r2d
   // store shared memory (in reverse order so $r0d is base again at the end)
   long mov b32 $r3 $smemsz
   sub b32 $r3 $c $r3 0x4
   s $c bra #shared_done
   add b32 $r0 $c $r0 $r3
   add b32 $r1 $r1 0x0 $c
shared_loop:
   long ld b32 $r2 s[$r3]
   long st b32 wb g[$r0d+0x140] $r2
   sub b32 $r0 $c $r0 0x4
   sub b32 $r1 $r1 0x0 $c
   sub b32 $r3 $c $r3 0x4
   lg $c bra #shared_loop
shared_done:
   // search the stack for trap entry to retrieve PC
   mov b32 $r0 c0[0x1908]
   mov b32 $r1 c0[0x190c]
   membar sys
   // invalidate caches so we can read stack entries via g[]
   cctl ivall 0 l[0]
   cctl ivall 0 g[$r0d]
   // get offsets
   mov b32 $r2 $physid
   ext u32 $r3 $r2 0x0814 // MP id
   ext u32 $r2 $r2 0x0608 // warp id
   mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
   mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
   add b32 $r2 $r2 $r3 // MP + warp offset
   add b32 $r0 $c $r0 $r2
   add b32 $r1 $r1 0x0 $c
   // $r3 = bytes of cstack left to scan. It is loaded once, BEFORE the loop
   // label: loading it inside the loop would reset the counter on every
   // pass, so the search could never give up and reach "bra #end_exit".
   mov b32 $r3 c0[0x1918] // cstack size
search_cstack:
   ld u8 $r2 cv g[$r0d+0x8]
   set $p0 0x1 eq u32 $r2 0xa
   $p0 bra #entry_found
   // advance to the next 0x10-byte entry and count it off
   add b32 $r0 $c $r0 0x10
   add b32 $r1 $r1 0x0 $c
   sub b32 $r3 $c $r3 0x10
   lg $c bra #search_cstack
   bra #end_exit
entry_found:
   // load PC (may be unaligned and spread out)
   ld b32 $r2 cv g[$r0d]
   mov b32 $r0 c0[0x1900]
   mov b32 $r1 c0[0x1904]
   st b32 wb g[$r0d+0x4] $r2
   join nop
   // invalidate caches and exit
end_exit:
   cctl ivall 0 g[0]
   bpt pause 0x0
   rtt terminate
end_cont:
   // warp did not cause the trap: restore $flags and $r0q, then resume
   bpt pause 0x0
   mov $flags $r2 mask 0xffff
   ld b128 $r0q cs l[0x00]
   rtt

// Table of entry points for the labelled builtin routines above
.section #gk104_builtin_offsets
.b64 #gk104_div_u32
.b64 #gk104_div_s32
.b64 #gk104_rcp_f64
.b64 #gk104_rsq_f64