Path: blob/21.2-virgl/src/amd/compiler/aco_assembler.cpp
4550 views
/*1* Copyright © 2018 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_builder.h"25#include "aco_ir.h"2627#include "common/sid.h"2829#include "util/memstream.h"3031#include <algorithm>32#include <map>33#include <vector>3435namespace aco {3637struct constaddr_info {38unsigned getpc_end;39unsigned add_literal;40};4142struct asm_context {43Program* program;44enum chip_class chip_class;45std::vector<std::pair<int, SOPP_instruction*>> branches;46std::map<unsigned, constaddr_info> constaddrs;47const int16_t* opcode;48// TODO: keep track of branch instructions referring blocks49// and, when emitting the block, correct the offset in instr50asm_context(Program* program_) : program(program_), chip_class(program->chip_class)51{52if (chip_class <= GFX7)53opcode = &instr_info.opcode_gfx7[0];54else if (chip_class <= GFX9)55opcode = &instr_info.opcode_gfx9[0];56else if (chip_class >= GFX10)57opcode = &instr_info.opcode_gfx10[0];58}5960int subvector_begin_pos = -1;61};6263static uint32_t64get_sdwa_sel(unsigned sel, PhysReg reg)65{66if (sel & sdwa_isra) {67unsigned size = sdwa_rasize & sel;68if (size == 1)69return reg.byte();70else /* size == 2 */71return sdwa_isword | (reg.byte() >> 1);72}73return sel & sdwa_asuint;74}7576unsigned77get_mimg_nsa_dwords(const Instruction* instr)78{79unsigned addr_dwords = instr->operands.size() - 3;80for (unsigned i = 1; i < addr_dwords; i++) {81if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))82return DIV_ROUND_UP(addr_dwords - 1, 4);83}84return 0;85}8687void88emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)89{90/* lower remaining pseudo-instructions */91if (instr->opcode == aco_opcode::p_constaddr_getpc) {92ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;9394instr->opcode = aco_opcode::s_getpc_b64;95instr->operands.pop_back();96} else if (instr->opcode == aco_opcode::p_constaddr_addlo) {97ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1;9899instr->opcode = aco_opcode::s_add_u32;100instr->operands[1] = Operand::zero();101instr->operands[1].setFixed(PhysReg(255));102}103104uint32_t opcode = ctx.opcode[(int)instr->opcode];105if (opcode == (uint32_t)-1) {106char* outmem;107size_t outsize;108struct u_memstream mem;109u_memstream_open(&mem, &outmem, &outsize);110FILE* const memf = u_memstream_get(&mem);111112fprintf(memf, "Unsupported opcode: ");113aco_print_instr(instr, memf);114u_memstream_close(&mem);115116aco_err(ctx.program, outmem);117free(outmem);118119abort();120}121122switch (instr->format) {123case Format::SOP2: {124uint32_t encoding = (0b10 << 30);125encoding |= opcode << 23;126encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;127encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;128encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;129out.push_back(encoding);130break;131}132case Format::SOPK: {133SOPK_instruction& sopk = instr->sopk();134135if (instr->opcode == aco_opcode::s_subvector_loop_begin) {136assert(ctx.chip_class >= GFX10);137assert(ctx.subvector_begin_pos == -1);138ctx.subvector_begin_pos = out.size();139} else if (instr->opcode == aco_opcode::s_subvector_loop_end) {140assert(ctx.chip_class >= GFX10);141assert(ctx.subvector_begin_pos != -1);142/* Adjust s_subvector_loop_begin instruction to the address after the end */143out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos);144/* Adjust s_subvector_loop_end instruction to the address after the beginning */145sopk.imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size());146ctx.subvector_begin_pos = -1;147}148149uint32_t encoding = (0b1011 << 28);150encoding |= opcode << 23;151encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)152? instr->definitions[0].physReg() << 16153: !instr->operands.empty() && instr->operands[0].physReg() <= 127154? instr->operands[0].physReg() << 16155: 0;156encoding |= sopk.imm;157out.push_back(encoding);158break;159}160case Format::SOP1: {161uint32_t encoding = (0b101111101 << 23);162if (opcode >= 55 && ctx.chip_class <= GFX9) {163assert(ctx.chip_class == GFX9 && opcode < 60);164opcode = opcode - 4;165}166encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;167encoding |= opcode << 8;168encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;169out.push_back(encoding);170break;171}172case Format::SOPC: {173uint32_t encoding = (0b101111110 << 23);174encoding |= opcode << 16;175encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;176encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;177out.push_back(encoding);178break;179}180case Format::SOPP: {181SOPP_instruction& sopp = instr->sopp();182uint32_t encoding = (0b101111111 << 23);183encoding |= opcode << 16;184encoding |= (uint16_t)sopp.imm;185if (sopp.block != -1) {186sopp.pass_flags = 0;187ctx.branches.emplace_back(out.size(), &sopp);188}189out.push_back(encoding);190break;191}192case Format::SMEM: {193SMEM_instruction& smem = instr->smem();194bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);195bool is_load = !instr->definitions.empty();196uint32_t encoding = 0;197198if (ctx.chip_class <= GFX7) {199encoding = (0b11000 << 27);200encoding |= opcode << 22;201encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0;202encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0;203if (instr->operands.size() >= 2) {204if (!instr->operands[1].isConstant()) {205encoding |= instr->operands[1].physReg().reg();206} else if (instr->operands[1].constantValue() >= 1024) {207encoding |= 255; /* SQ_SRC_LITERAL */208} else {209encoding |= instr->operands[1].constantValue() >> 2;210encoding |= 1 << 8;211}212}213out.push_back(encoding);214/* SMRD instructions can take a literal on GFX7 */215if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&216instr->operands[1].constantValue() >= 1024)217out.push_back(instr->operands[1].constantValue() >> 2);218return;219}220221if (ctx.chip_class <= GFX9) {222encoding = (0b110000 << 26);223assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */224encoding |= smem.nv ? 1 << 15 : 0;225} else {226encoding = (0b111101 << 26);227assert(!smem.nv); /* Non-volatile is not supported on GFX10 */228encoding |= smem.dlc ? 1 << 14 : 0;229}230231encoding |= opcode << 18;232encoding |= smem.glc ? 1 << 16 : 0;233234if (ctx.chip_class <= GFX9) {235if (instr->operands.size() >= 2)236encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */237}238if (ctx.chip_class == GFX9) {239encoding |= soe ? 1 << 14 : 0;240}241242if (is_load || instr->operands.size() >= 3) { /* SDATA */243encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())244<< 6;245}246if (instr->operands.size() >= 1) { /* SBASE */247encoding |= instr->operands[0].physReg() >> 1;248}249250out.push_back(encoding);251encoding = 0;252253int32_t offset = 0;254uint32_t soffset = ctx.chip_class >= GFX10255? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */256: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on257GFX8 and below) */258if (instr->operands.size() >= 2) {259const Operand& op_off1 = instr->operands[1];260if (ctx.chip_class <= GFX9) {261offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();262} else {263/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an264* SGPR */265if (op_off1.isConstant()) {266offset = op_off1.constantValue();267} else {268soffset = op_off1.physReg();269assert(!soe); /* There is no place to put the other SGPR offset, if any */270}271}272273if (soe) {274const Operand& op_off2 = instr->operands.back();275assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant276and an SGPR at the same time */277assert(!op_off2.isConstant());278soffset = op_off2.physReg();279}280}281encoding |= offset;282encoding |= soffset << 25;283284out.push_back(encoding);285return;286}287case Format::VOP2: {288uint32_t encoding = 0;289encoding |= opcode << 25;290encoding |= (0xFF & instr->definitions[0].physReg()) << 17;291encoding |= (0xFF & instr->operands[1].physReg()) << 9;292encoding |= instr->operands[0].physReg();293out.push_back(encoding);294break;295}296case Format::VOP1: {297uint32_t encoding = (0b0111111 << 25);298if (!instr->definitions.empty())299encoding |= (0xFF & instr->definitions[0].physReg()) << 17;300encoding |= opcode << 9;301if (!instr->operands.empty())302encoding |= instr->operands[0].physReg();303out.push_back(encoding);304break;305}306case Format::VOPC: {307uint32_t encoding = (0b0111110 << 25);308encoding |= opcode << 17;309encoding |= (0xFF & instr->operands[1].physReg()) << 9;310encoding |= instr->operands[0].physReg();311out.push_back(encoding);312break;313}314case Format::VINTRP: {315Interp_instruction& interp = instr->vintrp();316uint32_t encoding = 0;317318if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||319instr->opcode == aco_opcode::v_interp_p1lv_f16 ||320instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||321instr->opcode == aco_opcode::v_interp_p2_f16) {322if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {323encoding = (0b110100 << 26);324} else if (ctx.chip_class >= GFX10) {325encoding = (0b110101 << 26);326} else {327unreachable("Unknown chip_class.");328}329330encoding |= opcode << 16;331encoding |= (0xFF & instr->definitions[0].physReg());332out.push_back(encoding);333334encoding = 0;335encoding |= interp.attribute;336encoding |= interp.component << 6;337encoding |= instr->operands[0].physReg() << 9;338if (instr->opcode == aco_opcode::v_interp_p2_f16 ||339instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||340instr->opcode == aco_opcode::v_interp_p1lv_f16) {341encoding |= instr->operands[2].physReg() << 18;342}343out.push_back(encoding);344} else {345if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {346encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */347} else {348encoding = (0b110010 << 26);349}350351assert(encoding);352encoding |= (0xFF & instr->definitions[0].physReg()) << 18;353encoding |= opcode << 16;354encoding |= interp.attribute << 10;355encoding |= interp.component << 8;356if (instr->opcode == aco_opcode::v_interp_mov_f32)357encoding |= (0x3 & instr->operands[0].constantValue());358else359encoding |= (0xFF & instr->operands[0].physReg());360out.push_back(encoding);361}362break;363}364case Format::DS: {365DS_instruction& ds = instr->ds();366uint32_t encoding = (0b110110 << 26);367if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {368encoding |= opcode << 17;369encoding |= (ds.gds ? 1 : 0) << 16;370} else {371encoding |= opcode << 18;372encoding |= (ds.gds ? 1 : 0) << 17;373}374encoding |= ((0xFF & ds.offset1) << 8);375encoding |= (0xFFFF & ds.offset0);376out.push_back(encoding);377encoding = 0;378unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;379encoding |= (0xFF & reg) << 24;380reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)381? instr->operands[2].physReg()382: 0;383encoding |= (0xFF & reg) << 16;384reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)385? instr->operands[1].physReg()386: 0;387encoding |= (0xFF & reg) << 8;388encoding |= (0xFF & instr->operands[0].physReg());389out.push_back(encoding);390break;391}392case Format::MUBUF: {393MUBUF_instruction& mubuf = instr->mubuf();394uint32_t encoding = (0b111000 << 26);395encoding |= opcode << 18;396encoding |= (mubuf.lds ? 1 : 0) << 16;397encoding |= (mubuf.glc ? 1 : 0) << 14;398encoding |= (mubuf.idxen ? 1 : 0) << 13;399assert(!mubuf.addr64 || ctx.chip_class <= GFX7);400if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7)401encoding |= (mubuf.addr64 ? 1 : 0) << 15;402encoding |= (mubuf.offen ? 1 : 0) << 12;403if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {404assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */405encoding |= (mubuf.slc ? 1 : 0) << 17;406} else if (ctx.chip_class >= GFX10) {407encoding |= (mubuf.dlc ? 1 : 0) << 15;408}409encoding |= 0x0FFF & mubuf.offset;410out.push_back(encoding);411encoding = 0;412if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) {413encoding |= (mubuf.slc ? 1 : 0) << 22;414}415encoding |= instr->operands[2].physReg() << 24;416encoding |= (mubuf.tfe ? 1 : 0) << 23;417encoding |= (instr->operands[0].physReg() >> 2) << 16;418unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()419: instr->definitions[0].physReg();420encoding |= (0xFF & reg) << 8;421encoding |= (0xFF & instr->operands[1].physReg());422out.push_back(encoding);423break;424}425case Format::MTBUF: {426MTBUF_instruction& mtbuf = instr->mtbuf();427428uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf.dfmt, mtbuf.nfmt);429uint32_t encoding = (0b111010 << 26);430assert(img_format <= 0x7F);431assert(!mtbuf.dlc || ctx.chip_class >= GFX10);432encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */433encoding |= (mtbuf.glc ? 1 : 0) << 14;434encoding |= (mtbuf.idxen ? 1 : 0) << 13;435encoding |= (mtbuf.offen ? 1 : 0) << 12;436encoding |= 0x0FFF & mtbuf.offset;437encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */438439if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {440encoding |= opcode << 15;441} else {442encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */443}444445out.push_back(encoding);446encoding = 0;447448encoding |= instr->operands[2].physReg() << 24;449encoding |= (mtbuf.tfe ? 1 : 0) << 23;450encoding |= (mtbuf.slc ? 1 : 0) << 22;451encoding |= (instr->operands[0].physReg() >> 2) << 16;452unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()453: instr->definitions[0].physReg();454encoding |= (0xFF & reg) << 8;455encoding |= (0xFF & instr->operands[1].physReg());456457if (ctx.chip_class >= GFX10) {458encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */459}460461out.push_back(encoding);462break;463}464case Format::MIMG: {465unsigned nsa_dwords = get_mimg_nsa_dwords(instr);466assert(!nsa_dwords || ctx.chip_class >= GFX10);467468MIMG_instruction& mimg = instr->mimg();469uint32_t encoding = (0b111100 << 26);470encoding |= mimg.slc ? 1 << 25 : 0;471encoding |= (opcode & 0x7f) << 18;472encoding |= (opcode >> 7) & 1;473encoding |= mimg.lwe ? 1 << 17 : 0;474encoding |= mimg.tfe ? 1 << 16 : 0;475encoding |= mimg.glc ? 1 << 13 : 0;476encoding |= mimg.unrm ? 1 << 12 : 0;477if (ctx.chip_class <= GFX9) {478assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */479assert(!mimg.r128);480encoding |= mimg.a16 ? 1 << 15 : 0;481encoding |= mimg.da ? 1 << 14 : 0;482} else {483encoding |= mimg.r128 ? 1 << 15484: 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */485encoding |= nsa_dwords << 1;486encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */487encoding |= mimg.dlc ? 1 << 7 : 0;488}489encoding |= (0xF & mimg.dmask) << 8;490out.push_back(encoding);491encoding = (0xFF & instr->operands[3].physReg()); /* VADDR */492if (!instr->definitions.empty()) {493encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */494} else if (!instr->operands[2].isUndefined()) {495encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* VDATA */496}497encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */498if (!instr->operands[1].isUndefined())499encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */500501assert(!mimg.d16 || ctx.chip_class >= GFX9);502encoding |= mimg.d16 ? 1 << 31 : 0;503if (ctx.chip_class >= GFX10) {504/* GFX10: A16 still exists, but is in a different place */505encoding |= mimg.a16 ? 1 << 30 : 0;506}507508out.push_back(encoding);509510if (nsa_dwords) {511out.resize(out.size() + nsa_dwords);512std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);513for (unsigned i = 0; i < instr->operands.size() - 4u; i++)514nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);515}516break;517}518case Format::FLAT:519case Format::SCRATCH:520case Format::GLOBAL: {521FLAT_instruction& flat = instr->flatlike();522uint32_t encoding = (0b110111 << 26);523encoding |= opcode << 18;524if (ctx.chip_class <= GFX9) {525assert(flat.offset <= 0x1fff);526encoding |= flat.offset & 0x1fff;527} else if (instr->isFlat()) {528/* GFX10 has a 12-bit immediate OFFSET field,529* but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug530*/531assert(flat.offset == 0);532} else {533assert(flat.offset <= 0xfff);534encoding |= flat.offset & 0xfff;535}536if (instr->isScratch())537encoding |= 1 << 14;538else if (instr->isGlobal())539encoding |= 2 << 14;540encoding |= flat.lds ? 1 << 13 : 0;541encoding |= flat.glc ? 1 << 16 : 0;542encoding |= flat.slc ? 1 << 17 : 0;543if (ctx.chip_class >= GFX10) {544assert(!flat.nv);545encoding |= flat.dlc ? 1 << 12 : 0;546} else {547assert(!flat.dlc);548}549out.push_back(encoding);550encoding = (0xFF & instr->operands[0].physReg());551if (!instr->definitions.empty())552encoding |= (0xFF & instr->definitions[0].physReg()) << 24;553if (instr->operands.size() >= 3)554encoding |= (0xFF & instr->operands[2].physReg()) << 8;555if (!instr->operands[1].isUndefined()) {556assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);557assert(instr->format != Format::FLAT);558encoding |= instr->operands[1].physReg() << 16;559} else if (instr->format != Format::FLAT ||560ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */561if (ctx.chip_class <= GFX9)562encoding |= 0x7F << 16;563else564encoding |= sgpr_null << 16;565}566encoding |= flat.nv ? 1 << 23 : 0;567out.push_back(encoding);568break;569}570case Format::EXP: {571Export_instruction& exp = instr->exp();572uint32_t encoding;573if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {574encoding = (0b110001 << 26);575} else {576encoding = (0b111110 << 26);577}578579encoding |= exp.valid_mask ? 0b1 << 12 : 0;580encoding |= exp.done ? 0b1 << 11 : 0;581encoding |= exp.compressed ? 0b1 << 10 : 0;582encoding |= exp.dest << 4;583encoding |= exp.enabled_mask;584out.push_back(encoding);585encoding = 0xFF & exp.operands[0].physReg();586encoding |= (0xFF & exp.operands[1].physReg()) << 8;587encoding |= (0xFF & exp.operands[2].physReg()) << 16;588encoding |= (0xFF & exp.operands[3].physReg()) << 24;589out.push_back(encoding);590break;591}592case Format::PSEUDO:593case Format::PSEUDO_BARRIER:594if (instr->opcode != aco_opcode::p_unit_test)595unreachable("Pseudo instructions should be lowered before assembly.");596break;597default:598if (instr->isVOP3()) {599VOP3_instruction& vop3 = instr->vop3();600601if (instr->isVOP2()) {602opcode = opcode + 0x100;603} else if (instr->isVOP1()) {604if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9)605opcode = opcode + 0x140;606else607opcode = opcode + 0x180;608} else if (instr->isVOPC()) {609opcode = opcode + 0x0;610} else if (instr->isVINTRP()) {611opcode = opcode + 0x270;612}613614uint32_t encoding;615if (ctx.chip_class <= GFX9) {616encoding = (0b110100 << 26);617} else if (ctx.chip_class >= GFX10) {618encoding = (0b110101 << 26);619} else {620unreachable("Unknown chip_class.");621}622623if (ctx.chip_class <= GFX7) {624encoding |= opcode << 17;625encoding |= (vop3.clamp ? 1 : 0) << 11;626} else {627encoding |= opcode << 16;628encoding |= (vop3.clamp ? 1 : 0) << 15;629}630encoding |= vop3.opsel << 11;631for (unsigned i = 0; i < 3; i++)632encoding |= vop3.abs[i] << (8 + i);633if (instr->definitions.size() == 2)634encoding |= instr->definitions[1].physReg() << 8;635encoding |= (0xFF & instr->definitions[0].physReg());636out.push_back(encoding);637encoding = 0;638if (instr->opcode == aco_opcode::v_interp_mov_f32) {639encoding = 0x3 & instr->operands[0].constantValue();640} else {641for (unsigned i = 0; i < instr->operands.size(); i++)642encoding |= instr->operands[i].physReg() << (i * 9);643}644encoding |= vop3.omod << 27;645for (unsigned i = 0; i < 3; i++)646encoding |= vop3.neg[i] << (29 + i);647out.push_back(encoding);648649} else if (instr->isVOP3P()) {650VOP3P_instruction& vop3 = instr->vop3p();651652uint32_t encoding;653if (ctx.chip_class == GFX9) {654encoding = (0b110100111 << 23);655} else if (ctx.chip_class >= GFX10) {656encoding = (0b110011 << 26);657} else {658unreachable("Unknown chip_class.");659}660661encoding |= opcode << 16;662encoding |= (vop3.clamp ? 1 : 0) << 15;663encoding |= vop3.opsel_lo << 11;664encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14;665for (unsigned i = 0; i < 3; i++)666encoding |= vop3.neg_hi[i] << (8 + i);667encoding |= (0xFF & instr->definitions[0].physReg());668out.push_back(encoding);669encoding = 0;670for (unsigned i = 0; i < instr->operands.size(); i++)671encoding |= instr->operands[i].physReg() << (i * 9);672encoding |= (vop3.opsel_hi & 0x3) << 27;673for (unsigned i = 0; i < 3; i++)674encoding |= vop3.neg_lo[i] << (29 + i);675out.push_back(encoding);676677} else if (instr->isDPP()) {678assert(ctx.chip_class >= GFX8);679DPP_instruction& dpp = instr->dpp();680681/* first emit the instruction without the DPP operand */682Operand dpp_op = instr->operands[0];683instr->operands[0] = Operand(PhysReg{250}, v1);684instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);685emit_instruction(ctx, out, instr);686uint32_t encoding = (0xF & dpp.row_mask) << 28;687encoding |= (0xF & dpp.bank_mask) << 24;688encoding |= dpp.abs[1] << 23;689encoding |= dpp.neg[1] << 22;690encoding |= dpp.abs[0] << 21;691encoding |= dpp.neg[0] << 20;692if (ctx.chip_class >= GFX10)693encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */694encoding |= dpp.bound_ctrl << 19;695encoding |= dpp.dpp_ctrl << 8;696encoding |= (0xFF) & dpp_op.physReg();697out.push_back(encoding);698return;699} else if (instr->isSDWA()) {700SDWA_instruction& sdwa = instr->sdwa();701702/* first emit the instruction without the SDWA operand */703Operand sdwa_op = instr->operands[0];704instr->operands[0] = Operand(PhysReg{249}, v1);705instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);706emit_instruction(ctx, out, instr);707708uint32_t encoding = 0;709710if (instr->isVOPC()) {711if (instr->definitions[0].physReg() != vcc) {712encoding |= instr->definitions[0].physReg() << 8;713encoding |= 1 << 15;714}715encoding |= (sdwa.clamp ? 1 : 0) << 13;716} else {717encoding |= get_sdwa_sel(sdwa.dst_sel, instr->definitions[0].physReg()) << 8;718uint32_t dst_u = sdwa.dst_sel & sdwa_sext ? 1 : 0;719if (sdwa.dst_preserve || (sdwa.dst_sel & sdwa_isra))720dst_u = 2;721encoding |= dst_u << 11;722encoding |= (sdwa.clamp ? 1 : 0) << 13;723encoding |= sdwa.omod << 14;724}725726encoding |= get_sdwa_sel(sdwa.sel[0], sdwa_op.physReg()) << 16;727encoding |= sdwa.sel[0] & sdwa_sext ? 1 << 19 : 0;728encoding |= sdwa.abs[0] << 21;729encoding |= sdwa.neg[0] << 20;730731if (instr->operands.size() >= 2) {732encoding |= get_sdwa_sel(sdwa.sel[1], instr->operands[1].physReg()) << 24;733encoding |= sdwa.sel[1] & sdwa_sext ? 1 << 27 : 0;734encoding |= sdwa.abs[1] << 29;735encoding |= sdwa.neg[1] << 28;736}737738encoding |= 0xFF & sdwa_op.physReg();739encoding |= (sdwa_op.physReg() < 256) << 23;740if (instr->operands.size() >= 2)741encoding |= (instr->operands[1].physReg() < 256) << 31;742out.push_back(encoding);743} else {744unreachable("unimplemented instruction format");745}746break;747}748749/* append literal dword */750for (const Operand& op : instr->operands) {751if (op.isLiteral()) {752out.push_back(op.constantValue());753break;754}755}756}757758void759emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)760{761for (aco_ptr<Instruction>& instr : block.instructions) {762#if 0763int start_idx = out.size();764std::cerr << "Encoding:\t" << std::endl;765aco_print_instr(&*instr, stderr);766std::cerr << std::endl;767#endif768emit_instruction(ctx, out, instr.get());769#if 0770for (int i = start_idx; i < out.size(); i++)771std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;772#endif773}774}775776void777fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)778{779bool exported = false;780for (Block& block : program->blocks) {781if (!(block.kind & block_kind_export_end))782continue;783std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();784while (it != block.instructions.rend()) {785if ((*it)->isEXP()) {786Export_instruction& exp = (*it)->exp();787if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {788if (exp.dest >= V_008DFC_SQ_EXP_POS && exp.dest <= (V_008DFC_SQ_EXP_POS + 3)) {789exp.done = true;790exported = true;791break;792}793} else {794exp.done = true;795exp.valid_mask = true;796exported = true;797break;798}799} else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)800break;801++it;802}803}804805if (!exported) {806/* Abort in order to avoid a GPU hang. */807bool is_vertex_or_ngg =808(program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);809aco_err(program,810"Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");811aco_print_program(program, stderr);812abort();813}814}815816static void817insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,818unsigned insert_count, const uint32_t* insert_data)819{820out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);821822/* Update the offset of each affected block */823for (Block& block : ctx.program->blocks) {824if (block.offset >= insert_before)825block.offset += insert_count;826}827828/* Find first branch after the inserted code */829auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),830[insert_before](const auto& branch) -> bool831{ return (unsigned)branch.first >= insert_before; });832833/* Update the locations of branches */834for (; branch_it != ctx.branches.end(); ++branch_it)835branch_it->first += insert_count;836837/* Update the locations of p_constaddr instructions */838for (auto& constaddr : ctx.constaddrs) {839constaddr_info& info = constaddr.second;840if (info.getpc_end >= insert_before)841info.getpc_end += insert_count;842if (info.add_literal >= insert_before)843info.add_literal += insert_count;844}845}846847static void848fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)849{850/* Branches with an offset of 0x3f are buggy on GFX10,851* we workaround by inserting NOPs if needed.852*/853bool gfx10_3f_bug = false;854855do {856auto buggy_branch_it = std::find_if(857ctx.branches.begin(), ctx.branches.end(),858[&ctx](const auto& branch) -> bool {859return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==8600x3f;861});862863gfx10_3f_bug = buggy_branch_it != ctx.branches.end();864865if (gfx10_3f_bug) {866/* Insert an s_nop after the branch */867constexpr uint32_t s_nop_0 = 0xbf800000u;868insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0);869}870} while (gfx10_3f_bug);871}872873void874emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,875std::vector<uint32_t>& out)876{877Builder bld(ctx.program);878879Definition def_tmp_lo(branch->definitions[0].physReg(), s1);880Operand op_tmp_lo(branch->definitions[0].physReg(), s1);881Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1);882Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1);883884aco_ptr<Instruction> instr;885886if (branch->opcode != aco_opcode::s_branch) {887/* for conditional branches, skip the long jump if the condition is false */888aco_opcode inv;889switch (branch->opcode) {890case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;891case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;892case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;893case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;894case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;895case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;896default: unreachable("Unhandled long jump.");897}898instr.reset(bld.sopp(inv, -1, 7));899emit_instruction(ctx, out, instr.get());900}901902/* create the new PC and stash SCC in the LSB */903instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr);904emit_instruction(ctx, out, instr.get());905906instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);907instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */908emit_instruction(ctx, out, instr.get());909branch->pass_flags = out.size();910911instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi,912Operand::c32(backwards ? UINT32_MAX : 0u))913.instr);914emit_instruction(ctx, out, instr.get());915916/* restore SCC and clear the LSB of the new PC */917instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);918emit_instruction(ctx, out, instr.get());919instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr);920emit_instruction(ctx, out, instr.get());921922/* create the s_setpc_b64 to jump */923instr.reset(924bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);925emit_instruction(ctx, out, instr.get());926}927928void929fix_branches(asm_context& ctx, std::vector<uint32_t>& out)930{931bool repeat = false;932do {933repeat = false;934935if (ctx.chip_class == GFX10)936fix_branches_gfx10(ctx, out);937938for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {939int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;940if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {941std::vector<uint32_t> long_jump;942bool backwards =943ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;944emit_long_jump(ctx, branch.second, backwards, long_jump);945946out[branch.first] = long_jump[0];947insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1);948949repeat = true;950break;951}952953if (branch.second->pass_flags) {954int after_getpc = branch.first + branch.second->pass_flags - 2;955offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc;956out[branch.first + branch.second->pass_flags - 1] = offset * 4;957} else {958out[branch.first] &= 0xffff0000u;959out[branch.first] |= (uint16_t)offset;960}961}962} while (repeat);963}964965void966fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)967{968for (auto& constaddr : ctx.constaddrs) {969constaddr_info& info = constaddr.second;970out[info.add_literal] += (out.size() - info.getpc_end) * 4u;971}972}973974unsigned975emit_program(Program* program, std::vector<uint32_t>& code)976{977asm_context ctx(program);978979if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||980program->stage.hw == HWStage::NGG)981fix_exports(ctx, code, program);982983for (Block& block : program->blocks) {984block.offset = code.size();985emit_block(ctx, code, block);986}987988fix_branches(ctx, code);989990unsigned exec_size = code.size() * sizeof(uint32_t);991992if (program->chip_class >= GFX10) {993/* Pad output with s_code_end so instruction prefetching doesn't cause994* page faults */995unsigned final_size = align(code.size() + 3 * 16, 16);996while (code.size() < final_size)997code.push_back(0xbf9f0000u);998}9991000fix_constaddrs(ctx, code);10011002while (program->constant_data.size() % 4u)1003program->constant_data.push_back(0);1004/* Copy constant data */1005code.insert(code.end(), (uint32_t*)program->constant_data.data(),1006(uint32_t*)(program->constant_data.data() + program->constant_data.size()));10071008return exec_size;1009}10101011} // namespace aco101210131014