// Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
// 4574 views
.section #gm107_builtin_code
// Built-in helper routines: DIV U32, DIV S32, RCP F64, RSQ F64.
//
// NOTE: every `sched` line below holds three parenthesised groups and is
// followed by exactly three instructions. Keep that grouping intact when
// editing; comments and labels in between do not count.

// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
//          NOTE(review): only $p0 is written by the code below - confirm
//          whether $p1 still belongs in this list.
// SIZE:    22 / 14 * 8 bytes
//          NOTE(review): does not obviously match the instruction count
//          below - confirm or drop.
//
gm107_div_u32:
   // Initial estimate of z in $r2, built from the divisor's leading-bit
   // position: $r2 = 1 << (flo($r1) ^ 0x1f)
   sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6)
   flo u32 $r2 $r1
   lop xor 1 $r2 $r2 0x1f
   mov $r3 0x1 0xf
   sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1)
   shl $r2 $r3 $r2
   // From here on $r1 holds -b
   i2i u32 u32 $r1 neg $r1
   // Five refinement rounds, each: $r3 = -b * z ; z = hi(z * $r3) + z
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 rd 0x1 wt 0x1)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Quotient estimate: $r0 = hi(a * z); the original dividend is kept in $r3
   sched (st 0x6 wt 0x2) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
   mov $r3 $r0 0xf
   imul u32 u32 hi $r0 $r0 $r2
   // $r2 = b again (negation of the -b held in $r1)
   i2i u32 u32 $r2 neg $r1
   // Remainder estimate: $r1 = -b * q + a
   sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1)
   imad u32 u32 $r1 $r1 $r0 $r3
   // Up to two corrections: while remainder >= b, subtract b and bump q
   isetp ge u32 and $p0 1 $r1 $r2 1
   $p0 iadd $r1 $r1 neg $r2
   sched (st 0x5) (st 0xd) (st 0x1)
   $p0 iadd $r0 $r0 0x1
   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
   $p0 iadd $r1 $r1 neg $r2
   sched (st 0x1) (st 0xf) (st 0xf)
   $p0 iadd $r0 $r0 0x1
   ret
   nop 0

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
gm107_div_s32:
   // $p2 = (dividend < 0); $p3 = (divisor < 0) xor $p2.
   // They select the final negation of the modulus ($p2) and of the
   // quotient ($p3) at the end of the routine.
   sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0)
   isetp lt and $p2 0x1 $r0 0 1
   isetp lt xor $p3 1 $r1 0 $p2
   i2i s32 s32 $r0 abs $r0
   sched (st 0xf wr 0x1) (st 0xd wr 0x1 wt 0x2) (st 0x1 wt 0x2)
   i2i s32 s32 $r1 abs $r1
   // Same instruction sequence as gm107_div_u32 from here on, applied to
   // the absolute values
   flo u32 $r2 $r1
   lop xor 1 $r2 $r2 0x1f
   sched (st 0x6) (st 0x1) (st 0xf wr 0x1)
   mov $r3 0x1 0xf
   shl $r2 $r3 $r2
   i2i u32 u32 $r1 neg $r1
   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   mov $r3 $r0 0xf
   imul u32 u32 hi $r0 $r0 $r2
   sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3)
   i2i u32 u32 $r2 neg $r1
   imad u32 u32 $r1 $r1 $r0 $r3
   isetp ge u32 and $p0 1 $r1 $r2 1
   sched (st 0x1) (st 0x5) (st 0xd)
   $p0 iadd $r1 $r1 neg $r2
   $p0 iadd $r0 $r0 0x1
   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
   sched (st 0x1) (st 0x2) (st 0xf wr 0x0)
   $p0 iadd $r1 $r1 neg $r2
   $p0 iadd $r0 $r0 0x1
   // Restore signs: quotient is negated under $p3, modulus under $p2
   $p3 i2i s32 s32 $r0 neg $r0
   sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
   $p2 i2i s32 s32 $r1 neg $r1
   ret
   nop 0

// RCP F64
//
// INPUT:   $r0d
// OUTPUT:  $r0d
// CLOBBER: $r2 - $r9, $p0
//
// The core of the RCP and RSQ implementations is the Newton-Raphson step,
// which is used to find successively better approximations from an imprecise
// initial value (single precision rcp in RCP and rsqrt64h in RSQ).
//
gm107_rcp_f64:
   // Step 1: classify input according to exponent and value, and calculate
   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
   // bit 52 (bit 20 of the upper half) and is 11 bits in length
   sched (st 0x0) (st 0x0) (st 0x0)
   bfe u32 $r2 $r1 0xb14
   iadd32i $r3 $r2 -1
   ssy #rcp_rejoin
   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
   // mean that it's > 0x7fd in those cases when doing unsigned comparison
   sched (st 0x0) (st 0x0) (st 0x0)
   isetp gt u32 and $p0 1 $r3 0x7fd 1
   // $r3: 0 for norms, 0x36 for denorms, -1 for others
   mov $r3 0x0 0xf
   not $p0 sync
   // Process all special values: NaN, inf, denorm, 0
   sched (st 0x0) (st 0x0) (st 0x0)
   mov32i $r3 0xffffffff 0xf
   // A number is NaN if its abs value is greater than or unordered with inf
   dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
   not $p0 bra #rcp_inf_or_denorm_or_zero
   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
   // behavior is both seen on the CPU and the blob
   sched (st 0x0) (st 0x0) (st 0x0)
   lop32i or $r1 $r1 0x80000
   sync
rcp_inf_or_denorm_or_zero:
   lop32i and $r4 $r1 0x7ff00000
   sched (st 0x0) (st 0x0) (st 0x0)
   // Other values with nonzero in exponent field should be inf
   isetp eq and $p0 1 $r4 0x0 1
   $p0 bra #rcp_denorm_or_zero
   // +/-Inf -> +/-0
   lop32i xor $r1 $r1 0x7ff00000
   sched (st 0x0) (st 0x0) (st 0x0)
   mov $r0 0x0 0xf
   sync
rcp_denorm_or_zero:
   dsetp gtu and $p0 1 abs $r0 0x0 1
   sched (st 0x0) (st 0x0) (st 0x0)
   $p0 bra #rcp_denorm
   // +/-0 -> +/-Inf
   lop32i or $r1 $r1 0x7ff00000
   sync
rcp_denorm:
   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
   sched (st 0x0) (st 0x0) (st 0x0)
   dmul $r0 $r0 0x4350000000000000
   mov $r3 0x36 0xf
   sync
rcp_rejoin:
   // All numbers with -1 in $r3 have their result ready in $r0d, return them;
   // others need further calculation
   sched (st 0x0) (st 0x0) (st 0x0)
   isetp lt and $p0 1 $r3 0x0 1
   $p0 bra #rcp_end
   // Step 2: Before the real calculation goes on, renormalize the values to
   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1),
   // result in $r6d. The exponent will be recovered later.
   bfe u32 $r2 $r1 0xb14
   sched (st 0x0) (st 0x0) (st 0x0)
   lop32i and $r7 $r1 0x800fffff
   iadd32i $r7 $r7 0x3ff00000
   mov $r6 $r0 0xf
   // Step 3: Convert new value to float (no overflow will occur due to step
   // 2), calculate rcp and do one Newton-Raphson step
   sched (st 0x0) (st 0x0) (st 0x0)
   f2f ftz f64 f32 $r5 $r6
   mufu rcp $r4 $r5
   // 0xbf800000: -1.0f
   mov32i $r0 0xbf800000 0xf
   sched (st 0x0) (st 0x0) (st 0x0)
   ffma $r5 $r4 $r5 $r0
   ffma $r0 $r5 neg $r4 $r4
   // Step 4: convert result $r0 back to double, do Newton-Raphson steps
   f2f f32 f64 $r0 $r0
   sched (st 0x0) (st 0x0) (st 0x0)
   // $r6d = -src, $r8d = 1.0 (0x3f800000 is 1.0f)
   f2f f64 f64 $r6 neg $r6
   f2f f32 f64 $r8 0x3f800000
   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
   // The formula used here (and above) is:
   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
   // The following code uses 2 FMAs for each step, and it basically
   // looks like:
   //     tmp = -src * RCP_{n} + 1
   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
   dfma $r4 $r6 $r0 $r8
   sched (st 0x0) (st 0x0) (st 0x0)
   dfma $r0 $r0 $r4 $r0
   dfma $r4 $r6 $r0 $r8
   dfma $r0 $r0 $r4 $r0
   sched (st 0x0) (st 0x0) (st 0x0)
   dfma $r4 $r6 $r0 $r8
   dfma $r0 $r0 $r4 $r0
   dfma $r4 $r6 $r0 $r8
   sched (st 0x0) (st 0x0) (st 0x0)
   dfma $r0 $r0 $r4 $r0
   // Step 5: Exponent recovery and final processing
   // The exponent is recovered by adding what we added to the exponent.
   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
   //     rcp(x) = c * rcp(cx)
   // The delta in exponent comes from two sources:
   //   1) The renormalization in step 2. The delta is:
   //      0x3ff - $r2
   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
   //      in $r3
   // These 2 sources are calculated in the first two lines below, and then
   // added to the exponent extracted from the result above.
   // Note that after processing, the new exponent may be >= 0x7ff (inf)
   // or <= 0 (denorm). Those cases will be handled respectively below
   iadd $r2 neg $r2 0x3ff
   iadd $r4 $r2 $r3
   sched (st 0x0) (st 0x0) (st 0x0)
   bfe u32 $r3 $r1 0xb14
   // New exponent in $r3
   iadd $r3 $r3 $r4
   iadd32i $r2 $r3 -1
   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
   // (same logic as in step 1)
   sched (st 0x0) (st 0x0) (st 0x0)
   isetp lt u32 and $p0 1 $r2 0x7fe 1
   not $p0 bra #rcp_result_inf_or_denorm
   // Norms: convert exponents back and return
   // (shift the delta up to the exponent position, bit 20 of the upper half)
   shl $r4 $r4 0x14
   sched (st 0x0) (st 0x0) (st 0x0)
   iadd $r1 $r4 $r1
   bra #rcp_end
rcp_result_inf_or_denorm:
   // New exponent >= 0x7ff means that result is inf
   isetp ge and $p0 1 $r3 0x7ff 1
   sched (st 0x0) (st 0x0) (st 0x0)
   not $p0 bra #rcp_result_denorm
   // Infinity: keep the sign bit, set exponent to 0x7ff, clear the mantissa
   lop32i and $r1 $r1 0x80000000
   mov $r0 0x0 0xf
   sched (st 0x0) (st 0x0) (st 0x0)
   iadd32i $r1 $r1 0x7ff00000
   bra #rcp_end
rcp_result_denorm:
   // Denorm result comes from huge input. The greatest possible fp64, i.e.
   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
   // normal value. Other rcp results should be greater than that. If we
   // set the exponent field to 1, we can recover the result by multiplying
   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
   // the logic here.
   isetp ne u32 and $p0 1 $r3 0x0 1
   sched (st 0x0) (st 0x0) (st 0x0)
   lop32i and $r1 $r1 0x800fffff
   // 0x3e800000: 1/4
   $p0 f2f f32 f64 $r6 0x3e800000
   // 0x3f000000: 1/2
   not $p0 f2f f32 f64 $r6 0x3f000000
   sched (st 0x0) (st 0x0) (st 0x0)
   // Set the exponent field to 1, then scale by the factor chosen above
   iadd32i $r1 $r1 0x00100000
   dmul $r0 $r0 $r6
rcp_end:
   ret

// RSQ F64
//
// INPUT:   $r0d
// OUTPUT:  $r0d
// CLOBBER: $r2 - $r9, $p0 - $p1
//
gm107_rsq_f64:
   // Before getting initial result rsqrt64h, two special cases should be
   // handled first.
   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
   //    as NaN in rsqrt64h
   sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd)
   dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
   $p0 lop32i or $r1 $r1 0x00080000
   // $r2 = upper half without the sign bit (low half is OR-ed in below)
   lop32i and $r2 $r1 0x7fffffff
   // 2. denorms and small normal values: using their original value will
   //    lose precision either at rsqrt64h or the first step in newton-raphson
   //    steps below. Take 2 as a threshold in exponent field, and multiply
   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
   //    to recover in the end)
   sched (st 0xd) (st 0xd) (st 0xd)
   bfe u32 $r3 $r1 0xb14
   isetp le u32 and $p1 1 $r3 0x2 1
   lop or 1 $r2 $r0 $r2
   sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd)
   $p1 dmul $r0 $r0 0x4350000000000000
   mufu rsq64h $r5 $r1
   // rsqrt64h will give correct result for 0/inf/nan, the following logic
   // checks whether the input is one of those (exponent is 0x7ff or all 0
   // except for the sign bit)
   iset ne u32 and $r6 $r3 0x7ff 1
   sched (st 0xd) (st 0xd) (st 0xd)
   lop and 1 $r2 $r2 $r6
   isetp ne u32 and $p0 1 $r2 0x0 1
   $p0 bra #rsq_norm
   // For 0/inf/nan, make sure the sign bit agrees with input and return
   sched (st 0xd) (st 0xd) (st 0xd wt 0x1)
   lop32i and $r1 $r1 0x80000000
   mov $r0 0x0 0xf
   lop or 1 $r1 $r1 $r5
   sched (st 0xd) (st 0xf) (st 0xf)
   ret
   nop 0
   nop 0
rsq_norm:
   // For others, do 4 Newton-Raphson steps with the formula:
   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
   // In the code below, each step is written as:
   //     tmp1 = 0.5 * x * RSQ_{n}
   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
   // Register roles: $r4d = RSQ_{n} (upper half $r5 comes from rsqrt64h,
   // lower half zeroed here), $r8d = 0.5, $r2d = 0.5 * x, $r0d = tmp1,
   // $r6d = tmp2
   sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3)
   mov $r4 0x0 0xf
   // 0x3f000000: 1/2
   f2f f32 f64 $r8 0x3f000000
   dmul $r2 $r0 $r8
   sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
   dmul $r0 $r2 $r4
   dfma $r6 $r0 neg $r4 $r8
   dfma $r4 $r4 $r6 $r4
   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
   dmul $r0 $r2 $r4
   dfma $r6 $r0 neg $r4 $r8
   dfma $r4 $r4 $r6 $r4
   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
   dmul $r0 $r2 $r4
   dfma $r6 $r0 neg $r4 $r8
   dfma $r4 $r4 $r6 $r4
   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
   dmul $r0 $r2 $r4
   dfma $r6 $r0 neg $r4 $r8
   dfma $r4 $r4 $r6 $r4
   // Multiply 2^27 to result for small inputs to recover
   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd)
   $p1 dmul $r4 $r4 0x41a0000000000000
   // Move the result from $r4d to the output register pair $r0d
   mov $r1 $r5 0xf
   mov $r0 $r4 0xf
   sched (st 0xd) (st 0xf) (st 0xf)
   ret
   nop 0
   nop 0

// Entry-point table: one 64-bit entry per built-in, in the order
// div_u32, div_s32, rcp_f64, rsq_f64.
.section #gm107_builtin_offsets
.b64 #gm107_div_u32
.b64 #gm107_div_s32
.b64 #gm107_rcp_f64
.b64 #gm107_rsq_f64