Path: blob/21.2-virgl/src/intel/compiler/brw_fs_combine_constants.cpp
4550 views
/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_combine_constants.cpp
 *
 * This file contains the opt_combine_constants() pass that runs after the
 * regular optimization loop.
It passes over the instruction list and27* selectively promotes immediate values to registers by emitting a mov(1)28* instruction.29*30* This is useful on Gen 7 particularly, because a few instructions can be31* coissued (i.e., issued in the same cycle as another thread on the same EU32* issues an instruction) under some circumstances, one of which is that they33* cannot use immediate values.34*/3536#include "brw_fs.h"37#include "brw_cfg.h"38#include "util/half_float.h"3940using namespace brw;4142static const bool debug = false;4344/* Returns whether an instruction could co-issue if its immediate source were45* replaced with a GRF source.46*/47static bool48could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst)49{50if (devinfo->ver != 7)51return false;5253switch (inst->opcode) {54case BRW_OPCODE_MOV:55case BRW_OPCODE_CMP:56case BRW_OPCODE_ADD:57case BRW_OPCODE_MUL:58/* Only float instructions can coissue. We don't have a great59* understanding of whether or not something like float(int(a) + int(b))60* would be considered float (based on the destination type) or integer61* (based on the source types), so we take the conservative choice of62* only promoting when both destination and source are float.63*/64return inst->dst.type == BRW_REGISTER_TYPE_F &&65inst->src[0].type == BRW_REGISTER_TYPE_F;66default:67return false;68}69}7071/**72* Returns true for instructions that don't support immediate sources.73*/74static bool75must_promote_imm(const struct intel_device_info *devinfo, const fs_inst *inst)76{77switch (inst->opcode) {78case SHADER_OPCODE_POW:79return devinfo->ver < 8;80case BRW_OPCODE_MAD:81case BRW_OPCODE_LRP:82return true;83default:84return false;85}86}8788/** A box for putting fs_regs in a linked list. 
*/89struct reg_link {90DECLARE_RALLOC_CXX_OPERATORS(reg_link)9192reg_link(fs_reg *reg) : reg(reg) {}9394struct exec_node link;95fs_reg *reg;96};9798static struct exec_node *99link(void *mem_ctx, fs_reg *reg)100{101reg_link *l = new(mem_ctx) reg_link(reg);102return &l->link;103}104105/**106* Information about an immediate value.107*/108struct imm {109/** The common ancestor of all blocks using this immediate value. */110bblock_t *block;111112/**113* The instruction generating the immediate value, if all uses are contained114* within a single basic block. Otherwise, NULL.115*/116fs_inst *inst;117118/**119* A list of fs_regs that refer to this immediate. If we promote it, we'll120* have to patch these up to refer to the new GRF.121*/122exec_list *uses;123124/** The immediate value */125union {126char bytes[8];127double df;128int64_t d64;129float f;130int32_t d;131int16_t w;132};133uint8_t size;134135/** When promoting half-float we need to account for certain restrictions */136bool is_half_float;137138/**139* The GRF register and subregister number where we've decided to store the140* constant value.141*/142uint8_t subreg_offset;143uint16_t nr;144145/** The number of coissuable instructions using this immediate. */146uint16_t uses_by_coissue;147148/**149* Whether this constant is used by an instruction that can't handle an150* immediate source (and already has to be promoted to a GRF).151*/152bool must_promote;153154uint16_t first_use_ip;155uint16_t last_use_ip;156};157158/** The working set of information about immediates. 
*/159struct table {160struct imm *imm;161int size;162int len;163};164165static struct imm *166find_imm(struct table *table, void *data, uint8_t size)167{168for (int i = 0; i < table->len; i++) {169if (table->imm[i].size == size &&170!memcmp(table->imm[i].bytes, data, size)) {171return &table->imm[i];172}173}174return NULL;175}176177static struct imm *178new_imm(struct table *table, void *mem_ctx)179{180if (table->len == table->size) {181table->size *= 2;182table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);183}184return &table->imm[table->len++];185}186187/**188* Comparator used for sorting an array of imm structures.189*190* We sort by basic block number, then last use IP, then first use IP (least191* to greatest). This sorting causes immediates live in the same area to be192* allocated to the same register in the hopes that all values will be dead193* about the same time and the register can be reused.194*/195static int196compare(const void *_a, const void *_b)197{198const struct imm *a = (const struct imm *)_a,199*b = (const struct imm *)_b;200201int block_diff = a->block->num - b->block->num;202if (block_diff)203return block_diff;204205int end_diff = a->last_use_ip - b->last_use_ip;206if (end_diff)207return end_diff;208209return a->first_use_ip - b->first_use_ip;210}211212static bool213get_constant_value(const struct intel_device_info *devinfo,214const fs_inst *inst, uint32_t src_idx,215void *out, brw_reg_type *out_type)216{217const bool can_do_source_mods = inst->can_do_source_mods(devinfo);218const fs_reg *src = &inst->src[src_idx];219220*out_type = src->type;221222switch (*out_type) {223case BRW_REGISTER_TYPE_DF: {224double val = !can_do_source_mods ? src->df : fabs(src->df);225memcpy(out, &val, 8);226break;227}228case BRW_REGISTER_TYPE_F: {229float val = !can_do_source_mods ? 
src->f : fabsf(src->f);230memcpy(out, &val, 4);231break;232}233case BRW_REGISTER_TYPE_HF: {234uint16_t val = src->d & 0xffffu;235if (can_do_source_mods)236val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));237memcpy(out, &val, 2);238break;239}240case BRW_REGISTER_TYPE_Q: {241int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64);242memcpy(out, &val, 8);243break;244}245case BRW_REGISTER_TYPE_UQ:246memcpy(out, &src->u64, 8);247break;248case BRW_REGISTER_TYPE_D: {249int32_t val = !can_do_source_mods ? src->d : abs(src->d);250memcpy(out, &val, 4);251break;252}253case BRW_REGISTER_TYPE_UD:254memcpy(out, &src->ud, 4);255break;256case BRW_REGISTER_TYPE_W: {257int16_t val = src->d & 0xffffu;258if (can_do_source_mods)259val = abs(val);260memcpy(out, &val, 2);261break;262}263case BRW_REGISTER_TYPE_UW:264memcpy(out, &src->ud, 2);265break;266default:267return false;268};269270return true;271}272273static struct brw_reg274build_imm_reg_for_copy(struct imm *imm)275{276switch (imm->size) {277case 8:278return brw_imm_d(imm->d64);279case 4:280return brw_imm_d(imm->d);281case 2:282return brw_imm_w(imm->w);283default:284unreachable("not implemented");285}286}287288static inline uint32_t289get_alignment_for_imm(const struct imm *imm)290{291if (imm->is_half_float)292return 4; /* At least MAD seems to require this */293else294return imm->size;295}296297static bool298needs_negate(const fs_reg *reg, const struct imm *imm)299{300switch (reg->type) {301case BRW_REGISTER_TYPE_DF:302return signbit(reg->df) != signbit(imm->df);303case BRW_REGISTER_TYPE_F:304return signbit(reg->f) != signbit(imm->f);305case BRW_REGISTER_TYPE_Q:306return (reg->d64 < 0) != (imm->d64 < 0);307case BRW_REGISTER_TYPE_D:308return (reg->d < 0) != (imm->d < 0);309case BRW_REGISTER_TYPE_HF:310return (reg->d & 0x8000u) != (imm->w & 0x8000u);311case BRW_REGISTER_TYPE_W:312return ((int16_t)reg->d < 0) != (imm->w < 0);313case BRW_REGISTER_TYPE_UQ:314case BRW_REGISTER_TYPE_UD:315case 
BRW_REGISTER_TYPE_UW:316return false;317default:318unreachable("not implemented");319};320}321322static bool323representable_as_hf(float f, uint16_t *hf)324{325union fi u;326uint16_t h = _mesa_float_to_half(f);327u.f = _mesa_half_to_float(h);328329if (u.f == f) {330*hf = h;331return true;332}333334return false;335}336337static bool338represent_src_as_imm(const struct intel_device_info *devinfo,339fs_reg *src)340{341/* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,342* since HF/F mixed mode has been removed from the hardware.343*/344if (devinfo->ver == 12 && devinfo->verx10 < 125) {345uint16_t hf;346if (representable_as_hf(src->f, &hf)) {347*src = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF);348return true;349}350}351return false;352}353354bool355fs_visitor::opt_combine_constants()356{357void *const_ctx = ralloc_context(NULL);358359struct table table;360table.size = 8;361table.len = 0;362table.imm = ralloc_array(const_ctx, struct imm, table.size);363364const brw::idom_tree &idom = idom_analysis.require();365unsigned ip = -1;366367/* Make a pass through all instructions and count the number of times each368* constant is used by coissueable instructions or instructions that cannot369* take immediate arguments.370*/371foreach_block_and_inst(block, fs_inst, inst, cfg) {372ip++;373374if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))375continue;376377bool represented_as_imm = false;378for (int i = 0; i < inst->sources; i++) {379if (inst->src[i].file != IMM)380continue;381382if (!represented_as_imm && i == 0 &&383inst->opcode == BRW_OPCODE_MAD &&384represent_src_as_imm(devinfo, &inst->src[i])) {385represented_as_imm = true;386continue;387}388389char data[8];390brw_reg_type type;391if (!get_constant_value(devinfo, inst, i, data, &type))392continue;393394uint8_t size = type_sz(type);395396struct imm *imm = find_imm(&table, data, size);397398if (imm) {399bblock_t *intersection = idom.intersect(block, imm->block);400if (intersection != 
imm->block)401imm->inst = NULL;402imm->block = intersection;403imm->uses->push_tail(link(const_ctx, &inst->src[i]));404imm->uses_by_coissue += could_coissue(devinfo, inst);405imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);406imm->last_use_ip = ip;407if (type == BRW_REGISTER_TYPE_HF)408imm->is_half_float = true;409} else {410imm = new_imm(&table, const_ctx);411imm->block = block;412imm->inst = inst;413imm->uses = new(const_ctx) exec_list();414imm->uses->push_tail(link(const_ctx, &inst->src[i]));415memcpy(imm->bytes, data, size);416imm->size = size;417imm->is_half_float = type == BRW_REGISTER_TYPE_HF;418imm->uses_by_coissue = could_coissue(devinfo, inst);419imm->must_promote = must_promote_imm(devinfo, inst);420imm->first_use_ip = ip;421imm->last_use_ip = ip;422}423}424}425426/* Remove constants from the table that don't have enough uses to make them427* profitable to store in a register.428*/429for (int i = 0; i < table.len;) {430struct imm *imm = &table.imm[i];431432if (!imm->must_promote && imm->uses_by_coissue < 4) {433table.imm[i] = table.imm[table.len - 1];434table.len--;435continue;436}437i++;438}439if (table.len == 0) {440ralloc_free(const_ctx);441return false;442}443if (cfg->num_blocks != 1)444qsort(table.imm, table.len, sizeof(struct imm), compare);445446/* Insert MOVs to load the constant values into GRFs. */447fs_reg reg(VGRF, alloc.allocate(1));448reg.stride = 0;449for (int i = 0; i < table.len; i++) {450struct imm *imm = &table.imm[i];451/* Insert it either before the instruction that generated the immediate452* or after the last non-control flow instruction of the common ancestor.453*/454exec_node *n = (imm->inst ? imm->inst :455imm->block->last_non_control_flow_inst()->next);456457/* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:458*459* "In Align16 mode, the channel selects and channel enables apply to a460* pair of half-floats, because these parameters are defined for DWord461* elements ONLY. 
This is applicable when both source and destination462* are half-floats."463*464* This means that Align16 instructions that use promoted HF immediates465* and use a <0,1,0>:HF region would read 2 HF slots instead of466* replicating the single one we want. To avoid this, we always populate467* both HF slots within a DWord with the constant.468*/469const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;470const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);471472/* Put the immediate in an offset aligned to its size. Some instructions473* seem to have additional alignment requirements, so account for that474* too.475*/476reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));477478/* Ensure we have enough space in the register to copy the immediate */479struct brw_reg imm_reg = build_imm_reg_for_copy(imm);480if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {481reg.nr = alloc.allocate(1);482reg.offset = 0;483}484485ibld.MOV(retype(reg, imm_reg.type), imm_reg);486imm->nr = reg.nr;487imm->subreg_offset = reg.offset;488489reg.offset += imm->size * width;490}491shader_stats.promoted_constants = table.len;492493/* Rewrite the immediate sources to refer to the new GRFs. 
*/494for (int i = 0; i < table.len; i++) {495foreach_list_typed(reg_link, link, link, table.imm[i].uses) {496fs_reg *reg = link->reg;497#ifdef DEBUG498switch (reg->type) {499case BRW_REGISTER_TYPE_DF:500assert((isnan(reg->df) && isnan(table.imm[i].df)) ||501(fabs(reg->df) == fabs(table.imm[i].df)));502break;503case BRW_REGISTER_TYPE_F:504assert((isnan(reg->f) && isnan(table.imm[i].f)) ||505(fabsf(reg->f) == fabsf(table.imm[i].f)));506break;507case BRW_REGISTER_TYPE_HF:508assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&509isnan(_mesa_half_to_float(table.imm[i].w))) ||510(fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==511fabsf(_mesa_half_to_float(table.imm[i].w))));512break;513case BRW_REGISTER_TYPE_Q:514assert(abs(reg->d64) == abs(table.imm[i].d64));515break;516case BRW_REGISTER_TYPE_UQ:517assert(reg->d64 == table.imm[i].d64);518break;519case BRW_REGISTER_TYPE_D:520assert(abs(reg->d) == abs(table.imm[i].d));521break;522case BRW_REGISTER_TYPE_UD:523assert(reg->d == table.imm[i].d);524break;525case BRW_REGISTER_TYPE_W:526assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);527break;528case BRW_REGISTER_TYPE_UW:529assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);530break;531default:532break;533}534#endif535536reg->file = VGRF;537reg->offset = table.imm[i].subreg_offset;538reg->stride = 0;539reg->negate = needs_negate(reg, &table.imm[i]);540reg->nr = table.imm[i].nr;541}542}543544if (debug) {545for (int i = 0; i < table.len; i++) {546struct imm *imm = &table.imm[i];547548printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "549"Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",550(uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),551imm->block->num,552imm->nr,553imm->subreg_offset,554imm->must_promote,555imm->uses_by_coissue,556imm->first_use_ip,557imm->last_use_ip,558imm->last_use_ip - imm->first_use_ip);559}560}561562ralloc_free(const_ctx);563invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);564565return true;566}567568569