// Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
// 4574 views
/****************************************************************************1* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22* @file blend_jit.cpp23*24* @brief Implementation of the blend jitter25*26* Notes:27*28******************************************************************************/29#include "jit_pch.hpp"30#include "builder.h"31#include "jit_api.h"32#include "blend_jit.h"33#include "gen_state_llvm.h"34#include "functionpasses/passes.h"3536#include "util/compiler.h"3738// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized39#define QUANTIZE_THRESHOLD 24041using namespace llvm;42using namespace SwrJit;4344//////////////////////////////////////////////////////////////////////////45/// Interface to Jitting a blend shader46//////////////////////////////////////////////////////////////////////////47struct BlendJit : public 
Builder48{49BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};5051template <bool Color, bool Alpha>52void GenerateBlendFactor(SWR_BLEND_FACTOR factor,53Value* constColor[4],54Value* src[4],55Value* src1[4],56Value* dst[4],57Value* result[4])58{59Value* out[4];6061switch (factor)62{63case BLENDFACTOR_ONE:64out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);65break;66case BLENDFACTOR_SRC_COLOR:67out[0] = src[0];68out[1] = src[1];69out[2] = src[2];70out[3] = src[3];71break;72case BLENDFACTOR_SRC_ALPHA:73out[0] = out[1] = out[2] = out[3] = src[3];74break;75case BLENDFACTOR_DST_ALPHA:76out[0] = out[1] = out[2] = out[3] = dst[3];77break;78case BLENDFACTOR_DST_COLOR:79out[0] = dst[0];80out[1] = dst[1];81out[2] = dst[2];82out[3] = dst[3];83break;84case BLENDFACTOR_SRC_ALPHA_SATURATE:85out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));86out[3] = VIMMED1(1.0f);87break;88case BLENDFACTOR_CONST_COLOR:89out[0] = constColor[0];90out[1] = constColor[1];91out[2] = constColor[2];92out[3] = constColor[3];93break;94case BLENDFACTOR_CONST_ALPHA:95out[0] = out[1] = out[2] = out[3] = constColor[3];96break;97case BLENDFACTOR_SRC1_COLOR:98out[0] = src1[0];99out[1] = src1[1];100out[2] = src1[2];101out[3] = src1[3];102break;103case BLENDFACTOR_SRC1_ALPHA:104out[0] = out[1] = out[2] = out[3] = src1[3];105break;106case BLENDFACTOR_ZERO:107out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);108break;109case BLENDFACTOR_INV_SRC_COLOR:110out[0] = FSUB(VIMMED1(1.0f), src[0]);111out[1] = FSUB(VIMMED1(1.0f), src[1]);112out[2] = FSUB(VIMMED1(1.0f), src[2]);113out[3] = FSUB(VIMMED1(1.0f), src[3]);114break;115case BLENDFACTOR_INV_SRC_ALPHA:116out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);117break;118case BLENDFACTOR_INV_DST_ALPHA:119out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);120break;121case BLENDFACTOR_INV_DST_COLOR:122out[0] = FSUB(VIMMED1(1.0f), dst[0]);123out[1] = FSUB(VIMMED1(1.0f), dst[1]);124out[2] = FSUB(VIMMED1(1.0f), dst[2]);125out[3] 
= FSUB(VIMMED1(1.0f), dst[3]);126break;127case BLENDFACTOR_INV_CONST_COLOR:128out[0] = FSUB(VIMMED1(1.0f), constColor[0]);129out[1] = FSUB(VIMMED1(1.0f), constColor[1]);130out[2] = FSUB(VIMMED1(1.0f), constColor[2]);131out[3] = FSUB(VIMMED1(1.0f), constColor[3]);132break;133case BLENDFACTOR_INV_CONST_ALPHA:134out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);135break;136case BLENDFACTOR_INV_SRC1_COLOR:137out[0] = FSUB(VIMMED1(1.0f), src1[0]);138out[1] = FSUB(VIMMED1(1.0f), src1[1]);139out[2] = FSUB(VIMMED1(1.0f), src1[2]);140out[3] = FSUB(VIMMED1(1.0f), src1[3]);141break;142case BLENDFACTOR_INV_SRC1_ALPHA:143out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);144break;145default:146SWR_INVALID("Unsupported blend factor: %d", factor);147out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);148break;149}150151if (Color)152{153result[0] = out[0];154result[1] = out[1];155result[2] = out[2];156}157158if (Alpha)159{160result[3] = out[3];161}162}163164void Clamp(SWR_FORMAT format, Value* src[4])165{166const SWR_FORMAT_INFO& info = GetFormatInfo(format);167SWR_TYPE type = info.type[0];168169switch (type)170{171default:172break;173174case SWR_TYPE_UNORM:175src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));176src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));177src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));178src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));179break;180181case SWR_TYPE_SNORM:182src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));183src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));184src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));185src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));186break;187188case SWR_TYPE_UNKNOWN:189SWR_INVALID("Unsupported format type: %d", type);190}191}192193void ApplyDefaults(SWR_FORMAT format, Value* src[4])194{195const SWR_FORMAT_INFO& info = GetFormatInfo(format);196197bool valid[] = {false, 
false, false, false};198for (uint32_t c = 0; c < info.numComps; ++c)199{200valid[info.swizzle[c]] = true;201}202203for (uint32_t c = 0; c < 4; ++c)204{205if (!valid[c])206{207src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);208}209}210}211212void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])213{214const SWR_FORMAT_INFO& info = GetFormatInfo(format);215216for (uint32_t c = 0; c < info.numComps; ++c)217{218if (info.type[c] == SWR_TYPE_UNUSED)219{220src[info.swizzle[c]] =221BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);222}223}224}225226void Quantize(SWR_FORMAT format, Value* src[4])227{228const SWR_FORMAT_INFO& info = GetFormatInfo(format);229for (uint32_t c = 0; c < info.numComps; ++c)230{231if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)232{233uint32_t swizComp = info.swizzle[c];234float factor = (float)((1 << info.bpc[c]) - 1);235switch (info.type[c])236{237case SWR_TYPE_UNORM:238src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));239src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));240src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));241break;242default:243SWR_INVALID("Unsupported format type: %d", info.type[c]);244}245}246}247}248249template <bool Color, bool Alpha>250void BlendFunc(SWR_BLEND_OP blendOp,251Value* src[4],252Value* srcFactor[4],253Value* dst[4],254Value* dstFactor[4],255Value* result[4])256{257Value* out[4];258Value* srcBlend[4];259Value* dstBlend[4];260for (uint32_t i = 0; i < 4; ++i)261{262srcBlend[i] = FMUL(src[i], srcFactor[i]);263dstBlend[i] = FMUL(dst[i], dstFactor[i]);264}265266switch (blendOp)267{268case BLENDOP_ADD:269out[0] = FADD(srcBlend[0], dstBlend[0]);270out[1] = FADD(srcBlend[1], dstBlend[1]);271out[2] = FADD(srcBlend[2], dstBlend[2]);272out[3] = FADD(srcBlend[3], dstBlend[3]);273break;274275case BLENDOP_SUBTRACT:276out[0] = FSUB(srcBlend[0], dstBlend[0]);277out[1] = FSUB(srcBlend[1], dstBlend[1]);278out[2] = 
FSUB(srcBlend[2], dstBlend[2]);279out[3] = FSUB(srcBlend[3], dstBlend[3]);280break;281282case BLENDOP_REVSUBTRACT:283out[0] = FSUB(dstBlend[0], srcBlend[0]);284out[1] = FSUB(dstBlend[1], srcBlend[1]);285out[2] = FSUB(dstBlend[2], srcBlend[2]);286out[3] = FSUB(dstBlend[3], srcBlend[3]);287break;288289case BLENDOP_MIN:290out[0] = VMINPS(src[0], dst[0]);291out[1] = VMINPS(src[1], dst[1]);292out[2] = VMINPS(src[2], dst[2]);293out[3] = VMINPS(src[3], dst[3]);294break;295296case BLENDOP_MAX:297out[0] = VMAXPS(src[0], dst[0]);298out[1] = VMAXPS(src[1], dst[1]);299out[2] = VMAXPS(src[2], dst[2]);300out[3] = VMAXPS(src[3], dst[3]);301break;302303default:304SWR_INVALID("Unsupported blend operation: %d", blendOp);305out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);306break;307}308309if (Color)310{311result[0] = out[0];312result[1] = out[1];313result[2] = out[2];314}315316if (Alpha)317{318result[3] = out[3];319}320}321322void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])323{324// Op: (s == PS output, d = RT contents)325switch (logicOp)326{327case LOGICOP_CLEAR:328result[0] = VIMMED1(0);329result[1] = VIMMED1(0);330result[2] = VIMMED1(0);331result[3] = VIMMED1(0);332break;333334case LOGICOP_NOR:335// ~(s | d)336result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));337result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));338result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));339result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));340break;341342case LOGICOP_AND_INVERTED:343// ~s & d344// todo: use avx andnot instr when I can find the intrinsic to call345result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);346result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);347result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);348result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);349break;350351case LOGICOP_COPY_INVERTED:352// ~s353result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));354result[1] = XOR(src[1], 
VIMMED1(0xFFFFFFFF));355result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));356result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));357break;358359case LOGICOP_AND_REVERSE:360// s & ~d361// todo: use avx andnot instr when I can find the intrinsic to call362result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);363result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);364result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);365result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);366break;367368case LOGICOP_INVERT:369// ~d370result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));371result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));372result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));373result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));374break;375376case LOGICOP_XOR:377// s ^ d378result[0] = XOR(src[0], dst[0]);379result[1] = XOR(src[1], dst[1]);380result[2] = XOR(src[2], dst[2]);381result[3] = XOR(src[3], dst[3]);382break;383384case LOGICOP_NAND:385// ~(s & d)386result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));387result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));388result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));389result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));390break;391392case LOGICOP_AND:393// s & d394result[0] = AND(src[0], dst[0]);395result[1] = AND(src[1], dst[1]);396result[2] = AND(src[2], dst[2]);397result[3] = AND(src[3], dst[3]);398break;399400case LOGICOP_EQUIV:401// ~(s ^ d)402result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));403result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));404result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));405result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));406break;407408case LOGICOP_NOOP:409result[0] = dst[0];410result[1] = dst[1];411result[2] = dst[2];412result[3] = dst[3];413break;414415case LOGICOP_OR_INVERTED:416// ~s | d417result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);418result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);419result[2] = OR(XOR(src[2], 
VIMMED1(0xFFFFFFFF)), dst[2]);420result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);421break;422423case LOGICOP_COPY:424result[0] = src[0];425result[1] = src[1];426result[2] = src[2];427result[3] = src[3];428break;429430case LOGICOP_OR_REVERSE:431// s | ~d432result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);433result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);434result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);435result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);436break;437438case LOGICOP_OR:439// s | d440result[0] = OR(src[0], dst[0]);441result[1] = OR(src[1], dst[1]);442result[2] = OR(src[2], dst[2]);443result[3] = OR(src[3], dst[3]);444break;445446case LOGICOP_SET:447result[0] = VIMMED1(0xFFFFFFFF);448result[1] = VIMMED1(0xFFFFFFFF);449result[2] = VIMMED1(0xFFFFFFFF);450result[3] = VIMMED1(0xFFFFFFFF);451break;452453default:454SWR_INVALID("Unsupported logic operation: %d", logicOp);455result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);456break;457}458}459460void461AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)462{463// load uint32_t reference464Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));465466// load alpha467Value* pAlpha = LOAD(ppAlpha, {0, 0});468469Value* pTest = nullptr;470if (state.alphaTestFormat == ALPHA_TEST_UNORM8)471{472// convert float alpha to unorm8473Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));474pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);475476// compare477switch (state.alphaTestFunction)478{479case ZFUNC_ALWAYS:480pTest = VIMMED1(true);481break;482case ZFUNC_NEVER:483pTest = VIMMED1(false);484break;485case ZFUNC_LT:486pTest = ICMP_ULT(pAlphaU8, pRef);487break;488case ZFUNC_EQ:489pTest = ICMP_EQ(pAlphaU8, pRef);490break;491case ZFUNC_LE:492pTest = ICMP_ULE(pAlphaU8, pRef);493break;494case ZFUNC_GT:495pTest = ICMP_UGT(pAlphaU8, pRef);496break;497case ZFUNC_NE:498pTest = ICMP_NE(pAlphaU8, 
pRef);499break;500case ZFUNC_GE:501pTest = ICMP_UGE(pAlphaU8, pRef);502break;503default:504SWR_INVALID("Invalid alpha test function");505break;506}507}508else509{510// cast ref to float511pRef = BITCAST(pRef, mSimdFP32Ty);512513// compare514switch (state.alphaTestFunction)515{516case ZFUNC_ALWAYS:517pTest = VIMMED1(true);518break;519case ZFUNC_NEVER:520pTest = VIMMED1(false);521break;522case ZFUNC_LT:523pTest = FCMP_OLT(pAlpha, pRef);524break;525case ZFUNC_EQ:526pTest = FCMP_OEQ(pAlpha, pRef);527break;528case ZFUNC_LE:529pTest = FCMP_OLE(pAlpha, pRef);530break;531case ZFUNC_GT:532pTest = FCMP_OGT(pAlpha, pRef);533break;534case ZFUNC_NE:535pTest = FCMP_ONE(pAlpha, pRef);536break;537case ZFUNC_GE:538pTest = FCMP_OGE(pAlpha, pRef);539break;540default:541SWR_INVALID("Invalid alpha test function");542break;543}544}545546// load current mask547Value* pMask = LOAD(ppMask);548549// convert to int1 mask550pMask = MASK(pMask);551552// and with alpha test result553pMask = AND(pMask, pTest);554555// convert back to vector mask556pMask = VMASK(pMask);557558// store new mask559STORE(pMask, ppMask);560}561562Function* Create(const BLEND_COMPILE_STATE& state)563{564std::stringstream fnName("BLND_",565std::ios_base::in | std::ios_base::out | std::ios_base::ate);566fnName << ComputeCRC(0, &state, sizeof(state));567568// blend function signature569// typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);570571std::vector<Type*> args{572PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*573};574575// std::vector<Type*> args{576// PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*577//};578579FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);580Function* blendFunc = Function::Create(581fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);582blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());583584BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", 
blendFunc);585586IRB()->SetInsertPoint(entry);587588// arguments589auto argitr = blendFunc->arg_begin();590Value* pBlendContext = &*argitr++;591pBlendContext->setName("pBlendContext");592Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});593pBlendState->setName("pBlendState");594Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});595pSrc->setName("src");596Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});597pSrc1->setName("src1");598Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});599pSrc0Alpha->setName("src0alpha");600Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});601sampleNum->setName("sampleNum");602Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});603pDst->setName("pDst");604Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});605pResult->setName("result");606Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});607ppoMask->setName("ppoMask");608Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});609ppMask->setName("pMask");610611static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,612"Unsupported hot tile format");613Value* dst[4];614Value* constantColor[4];615Value* src[4];616Value* src1[4];617Value* result[4];618for (uint32_t i = 0; i < 4; ++i)619{620// load hot tile621dst[i] = LOAD(pDst, {0, i});622623// load constant color624constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));625626// load src627src[i] = LOAD(pSrc, {0, i});628629// load src1630src1[i] = LOAD(pSrc1, {0, i});631}632Value* currentSampleMask = VIMMED1(-1);633if (state.desc.alphaToCoverageEnable)634{635Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);636uint32_t bits = (1 << state.desc.numSamples) - 1;637currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));638currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);639}640641// alpha test642if 
(state.desc.alphaTestEnable)643{644// Gather for archrast stats645STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});646AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);647}648else649{650// Gather for archrast stats651STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});652}653654// color blend655if (state.blendState.blendEnable)656{657// Gather for archrast stats658STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});659660// clamp sources661Clamp(state.format, src);662Clamp(state.format, src1);663Clamp(state.format, dst);664Clamp(state.format, constantColor);665666// apply defaults to hottile contents to take into account missing components667ApplyDefaults(state.format, dst);668669// Force defaults for unused 'X' components670ApplyUnusedDefaults(state.format, dst);671672// Quantize low precision components673Quantize(state.format, dst);674675// special case clamping for R11G11B10_float which has no sign bit676if (state.format == R11G11B10_FLOAT)677{678dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));679dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));680dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));681dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));682}683684Value* srcFactor[4];685Value* dstFactor[4];686if (state.desc.independentAlphaBlendEnable)687{688GenerateBlendFactor<true, false>(689state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);690GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,691constantColor,692src,693src1,694dst,695srcFactor);696697GenerateBlendFactor<true, false>(698state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);699GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,700constantColor,701src,702src1,703dst,704dstFactor);705706BlendFunc<true, false>(707state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);708BlendFunc<false, true>(709state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, 
result);710}711else712{713GenerateBlendFactor<true, true>(714state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);715GenerateBlendFactor<true, true>(716state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);717718BlendFunc<true, true>(719state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);720}721722// store results out723for (uint32_t i = 0; i < 4; ++i)724{725STORE(result[i], pResult, {0, i});726}727}728else729{730// Gather for archrast stats731STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});732}733734if (state.blendState.logicOpEnable)735{736const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);737Value* vMask[4];738float scale[4];739740if (!state.blendState.blendEnable)741{742Clamp(state.format, src);743Clamp(state.format, dst);744}745746for (uint32_t i = 0; i < 4; i++)747{748if (info.type[i] == SWR_TYPE_UNUSED)749{750continue;751}752753if (info.bpc[i] >= 32)754{755vMask[i] = VIMMED1(0xFFFFFFFF);756scale[i] = 0xFFFFFFFF;757}758else759{760vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);761if (info.type[i] == SWR_TYPE_SNORM)762scale[i] = (1 << (info.bpc[i] - 1)) - 1;763else764scale[i] = (1 << info.bpc[i]) - 1;765}766767switch (info.type[i])768{769default:770SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);771break;772773case SWR_TYPE_UNKNOWN:774case SWR_TYPE_UNUSED:775FALLTHROUGH;776777case SWR_TYPE_UINT:778case SWR_TYPE_SINT:779src[i] = BITCAST(src[i], mSimdInt32Ty);780dst[i] = BITCAST(dst[i], mSimdInt32Ty);781break;782case SWR_TYPE_SNORM:783src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);784dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);785break;786case SWR_TYPE_UNORM:787src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);788dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);789break;790}791}792793LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);794795// store results out796for (uint32_t i = 0; 
i < 4; ++i)797{798if (info.type[i] == SWR_TYPE_UNUSED)799{800continue;801}802803// clear upper bits from PS output not in RT format after doing logic op804result[i] = AND(result[i], vMask[i]);805806switch (info.type[i])807{808default:809SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);810break;811812case SWR_TYPE_UNKNOWN:813case SWR_TYPE_UNUSED:814FALLTHROUGH;815816case SWR_TYPE_UINT:817case SWR_TYPE_SINT:818result[i] = BITCAST(result[i], mSimdFP32Ty);819break;820case SWR_TYPE_SNORM:821result[i] = SHL(result[i], C(32 - info.bpc[i]));822result[i] = ASHR(result[i], C(32 - info.bpc[i]));823result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));824break;825case SWR_TYPE_UNORM:826result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));827break;828}829830STORE(result[i], pResult, {0, i});831}832}833834if (state.desc.oMaskEnable)835{836assert(!(state.desc.alphaToCoverageEnable));837// load current mask838Value* oMask = LOAD(ppoMask);839currentSampleMask = AND(oMask, currentSampleMask);840}841842if (state.desc.sampleMaskEnable)843{844Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});845currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);846}847848if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||849state.desc.oMaskEnable)850{851// load coverage mask and mask off any lanes with no samples852Value* pMask = LOAD(ppMask);853Value* sampleMasked = SHL(C(1), sampleNum);854currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));855currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);856Value* outputMask = AND(pMask, currentSampleMask);857// store new mask858STORE(outputMask, GEP(ppMask, C(0)));859}860861RET_VOID();862863JitManager::DumpToFile(blendFunc, "");864865::FunctionPassManager 
passes(JM()->mpCurrentModule);866867passes.add(createBreakCriticalEdgesPass());868passes.add(createCFGSimplificationPass());869passes.add(createEarlyCSEPass());870passes.add(createPromoteMemoryToRegisterPass());871passes.add(createCFGSimplificationPass());872passes.add(createEarlyCSEPass());873passes.add(createInstructionCombiningPass());874#if LLVM_VERSION_MAJOR <= 11875passes.add(createConstantPropagationPass());876#endif877passes.add(createSCCPPass());878passes.add(createAggressiveDCEPass());879880passes.add(createLowerX86Pass(this));881882passes.run(*blendFunc);883884JitManager::DumpToFile(blendFunc, "optimized");885886return blendFunc;887}888};889890//////////////////////////////////////////////////////////////////////////891/// @brief JITs from fetch shader IR892/// @param hJitMgr - JitManager handle893/// @param func - LLVM function IR894/// @return PFN_FETCH_FUNC - pointer to fetch code895PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)896{897const llvm::Function* func = (const llvm::Function*)hFunc;898JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);899PFN_BLEND_JIT_FUNC pfnBlend;900pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));901// MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot902// add new IR to the module903pJitMgr->mIsModuleFinalized = true;904905return pfnBlend;906}907908//////////////////////////////////////////////////////////////////////////909/// @brief JIT compiles blend shader910/// @param hJitMgr - JitManager handle911/// @param state - blend state to build function from912extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr,913const BLEND_COMPILE_STATE& state)914{915JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);916917pJitMgr->SetupNewModule();918919BlendJit theJit(pJitMgr);920HANDLE hFunc = theJit.Create(state);921922return JitBlendFunc(hJitMgr, hFunc);923}924925926