Path: blob/21.2-virgl/src/freedreno/ir3/ir3_merge_regs.c

/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *      have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *      components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: That is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement that
 * value-comparison optimization they explain, as without it we wouldn't be
 * able to coalesce META_SPLIT even in the simplest of cases.
 */
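
/* As a small illustration (hypothetical IR, not actual pass output): a phi
 * such as
 *
 *    ssa_3 = phi(ssa_1, ssa_2)
 *
 * first gets a parallelcopy inserted at the end of each predecessor block:
 *
 *    pred0: ssa_4 = parallelcopy(ssa_1)
 *    pred1: ssa_5 = parallelcopy(ssa_2)
 *    succ:  ssa_3 = phi(ssa_4, ssa_5)
 *
 * and then ssa_3, ssa_4, and ssa_5 are placed in one merge set, so that RA
 * can give all three the same register and the copies become trivial.
 */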

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
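
/* A sketch of the value-chasing above, with hypothetical IR: given
 *
 *    ssa_2 = collect(ssa_0, ssa_1)
 *    ssa_3 = split(ssa_2, off=0)
 *
 * chase_copies() on ssa_3 steps through the split into the collect and ends
 * at (ssa_0, offset 0). Chasing ssa_0 itself trivially yields the same
 * def_value, so even if ssa_0 and ssa_3 are live at the same time they are
 * known to hold the same bits and interference checking can be skipped.
 */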

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only 1 or
    * 2 so far this should be ok.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
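
/* Worked example for the size/offset bookkeeping above (values assumed, not
 * taken from a real shader): merging a set b of size 2 into a set a of size
 * 4 at b_offset == 2 leaves a->size == MAX2(4, 2 + 2) == 4, and each
 * register coming from b gets its merge_set_offset bumped by 2, keeping all
 * offsets relative to the start of the combined set. With b_offset == 4 the
 * combined size would instead grow to MAX2(4, 2 + 4) == 6.
 */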

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: Even in this case we may not always successfully be able to
       * coalesce this copy, if the offsets don't line up. But in any
       * case, we can't do anything.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}
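
/* For instance (hypothetical IR, full-precision registers so each element is
 * 2 half-regs wide): for
 *
 *    ssa_4 = collect(ssa_1, ssa_2, ssa_3)
 *
 * we try to merge ssa_1 at offset 0, ssa_2 at offset 2, and ssa_3 at offset
 * 4 of ssa_4's merge set, so that RA can allocate the sources directly into
 * the contiguous destination and the collect needs no copies.
 */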

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = reg_elems(src[j]);
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }
}

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   printf("merge sets:\n");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            printf("merge set, size %u, align %u:\n", merge_set->size,
                   merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               printf("\t" SYN_SSA("ssa_%u") ":%u, offset %u\n",
                      reg->instr->serialno, reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}
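
/* With IR3_DBG_RAMSGS the dump above produces output along these lines (the
 * numbers here are made up for illustration):
 *
 *    merge sets:
 *    merge set, size 6, align 2:
 *            ssa_12:0, offset 0
 *            ssa_9:0, offset 2
 *            ssa_10:0, offset 4
 */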

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   index_merge_sets(ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}
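
/* Expected call order (a sketch; the real call sites live in the RA code,
 * and the ir3_calc_liveness() entry point is assumed here): parallel copies
 * must exist before liveness is computed, since they introduce new defs that
 * liveness has to cover, and merging consumes that liveness information:
 *
 *    ir3_create_parallel_copies(ir);
 *    struct ir3_liveness *live = ir3_calc_liveness(...);
 *    ir3_merge_regs(live, ir);
 */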