Path: blob/21.2-virgl/src/gallium/drivers/iris/iris_state.c
4565 views
/*1* Copyright © 2017 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included11* in all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS14* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER17* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING18* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER19* DEALINGS IN THE SOFTWARE.20*/2122/**23* @file iris_state.c24*25* ============================= GENXML CODE =============================26* [This file is compiled once per generation.]27* =======================================================================28*29* This is the main state upload code.30*31* Gallium uses Constant State Objects, or CSOs, for most state. Large,32* complex, or highly reusable state can be created once, and bound and33* rebound multiple times. This is modeled with the pipe->create_*_state()34* and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is35* streamed out on the fly, via pipe->set_*_state() hooks.36*37* OpenGL involves frequently mutating context state, which is mirrored in38* core Mesa by highly mutable data structures. However, most applications39* typically draw the same things over and over - from frame to frame, most40* of the same objects are still visible and need to be redrawn. So, rather41* than inventing new state all the time, applications usually mutate to swap42* between known states that we've seen before.43*44* Gallium isolates us from this mutation by tracking API state, and45* distilling it into a set of Constant State Objects, or CSOs. Large,46* complex, or typically reusable state can be created once, then reused47* multiple times. Drivers can create and store their own associated data.48* This create/bind model corresponds to the pipe->create_*_state() and49* pipe->bind_*_state() driver hooks.50*51* Some state is cheap to create, or expected to be highly dynamic. Rather52* than creating and caching piles of CSOs for these, Gallium simply streams53* them out, via the pipe->set_*_state() driver hooks.54*55* To reduce draw time overhead, we try to compute as much state at create56* time as possible. Wherever possible, we translate the Gallium pipe state57* to 3DSTATE commands, and store those commands in the CSO. At draw time,58* we can simply memcpy them into a batch buffer.59*60* No hardware matches the abstraction perfectly, so some commands require61* information from multiple CSOs. In this case, we can store two copies62* of the packet (one in each CSO), and simply | together their DWords at63* draw time. Sometimes the second set is trivial (one or two fields), so64* we simply pack it at draw time.65*66* There are two main components in the file below. First, the CSO hooks67* create/bind/track state. The second are the draw-time upload functions,68* iris_upload_render_state() and iris_upload_compute_state(), which read69* the context state and emit the commands into the actual batch.70*/7172#include <stdio.h>73#include <errno.h>7475#if HAVE_VALGRIND76#include <valgrind.h>77#include <memcheck.h>78#define VG(x) x79#ifdef DEBUG80#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))81#endif82#else83#define VG(x)84#endif8586#include "pipe/p_defines.h"87#include "pipe/p_state.h"88#include "pipe/p_context.h"89#include "pipe/p_screen.h"90#include "util/u_dual_blend.h"91#include "util/u_inlines.h"92#include "util/format/u_format.h"93#include "util/u_framebuffer.h"94#include "util/u_transfer.h"95#include "util/u_upload_mgr.h"96#include "util/u_viewport.h"97#include "util/u_memory.h"98#include "drm-uapi/i915_drm.h"99#include "nir.h"100#include "intel/compiler/brw_compiler.h"101#include "intel/common/intel_aux_map.h"102#include "intel/common/intel_l3_config.h"103#include "intel/common/intel_sample_positions.h"104#include "iris_batch.h"105#include "iris_context.h"106#include "iris_defines.h"107#include "iris_pipe.h"108#include "iris_resource.h"109110#include "iris_genx_macros.h"111#include "intel/common/intel_guardband.h"112113/**114* Statically assert that PIPE_* enums match the hardware packets.115* (As long as they match, we don't need to translate them.)116*/117UNUSED static void pipe_asserts()118{119#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)120121/* pipe_logicop happens to match the hardware. */122PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);123PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);124PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);125PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);126PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);127PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);128PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);129PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);130PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);131PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);132PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);133PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);134PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);135PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);136PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);137PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);138139/* pipe_blend_func happens to match the hardware. */140PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);141PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);142PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);143PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);144PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);145PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);146PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);147PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);148PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);149PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);150PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);151PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);152PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);153PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);154PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);155PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);156PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);157PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);158PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);159160/* pipe_blend_func happens to match the hardware. */161PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);162PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);163PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);164PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);165PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);166167/* pipe_stencil_op happens to match the hardware. */168PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);169PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);170PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);171PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);172PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);173PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);174PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);175PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);176177/* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */178PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);179PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);180#undef PIPE_ASSERT181}182183static unsigned184translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)185{186static const unsigned map[] = {187[PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,188[PIPE_PRIM_LINES] = _3DPRIM_LINELIST,189[PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,190[PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,191[PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,192[PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,193[PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,194[PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,195[PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,196[PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,197[PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,198[PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,199[PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,200[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,201[PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,202};203204return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);205}206207static unsigned208translate_compare_func(enum pipe_compare_func pipe_func)209{210static const unsigned map[] = {211[PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,212[PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,213[PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,214[PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,215[PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,216[PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,217[PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,218[PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,219};220return map[pipe_func];221}222223static unsigned224translate_shadow_func(enum pipe_compare_func pipe_func)225{226/* Gallium specifies the result of shadow comparisons as:227*228* 1 if ref <op> texel,229* 0 otherwise.230*231* The hardware does:232*233* 0 if texel <op> ref,234* 1 otherwise.235*236* So we need to flip the operator and also negate.237*/238static const unsigned map[] = {239[PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,240[PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,241[PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,242[PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,243[PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,244[PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,245[PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,246[PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,247};248return map[pipe_func];249}250251static unsigned252translate_cull_mode(unsigned pipe_face)253{254static const unsigned map[4] = {255[PIPE_FACE_NONE] = CULLMODE_NONE,256[PIPE_FACE_FRONT] = CULLMODE_FRONT,257[PIPE_FACE_BACK] = CULLMODE_BACK,258[PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,259};260return map[pipe_face];261}262263static unsigned264translate_fill_mode(unsigned pipe_polymode)265{266static const unsigned map[4] = {267[PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,268[PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,269[PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,270[PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,271};272return map[pipe_polymode];273}274275static unsigned276translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)277{278static const unsigned map[] = {279[PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,280[PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,281[PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,282};283return map[pipe_mip];284}285286static uint32_t287translate_wrap(unsigned pipe_wrap)288{289static const unsigned map[] = {290[PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,291[PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,292[PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,293[PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,294[PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,295[PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,296297/* These are unsupported. */298[PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,299[PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,300};301return map[pipe_wrap];302}303304/**305* Allocate space for some indirect state.306*307* Return a pointer to the map (to fill it out) and a state ref (for308* referring to the state in GPU commands).309*/310static void *311upload_state(struct u_upload_mgr *uploader,312struct iris_state_ref *ref,313unsigned size,314unsigned alignment)315{316void *p = NULL;317u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);318return p;319}320321/**322* Stream out temporary/short-lived state.323*324* This allocates space, pins the BO, and includes the BO address in the325* returned offset (which works because all state lives in 32-bit memory326* zones).327*/328static uint32_t *329stream_state(struct iris_batch *batch,330struct u_upload_mgr *uploader,331struct pipe_resource **out_res,332unsigned size,333unsigned alignment,334uint32_t *out_offset)335{336void *ptr = NULL;337338u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);339340struct iris_bo *bo = iris_resource_bo(*out_res);341iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);342343iris_record_state_size(batch->state_sizes,344bo->gtt_offset + *out_offset, size);345346*out_offset += iris_bo_offset_from_base_address(bo);347348return ptr;349}350351/**352* stream_state() + memcpy.353*/354static uint32_t355emit_state(struct iris_batch *batch,356struct u_upload_mgr *uploader,357struct pipe_resource **out_res,358const void *data,359unsigned size,360unsigned alignment)361{362unsigned offset = 0;363uint32_t *map =364stream_state(batch, uploader, out_res, size, alignment, &offset);365366if (map)367memcpy(map, data, size);368369return offset;370}371372/**373* Did field 'x' change between 'old_cso' and 'new_cso'?374*375* (If so, we may want to set some dirty flags.)376*/377#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))378#define cso_changed_memcmp(x) \379(!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)380381static void382flush_before_state_base_change(struct iris_batch *batch)383{384const struct intel_device_info *devinfo = &batch->screen->devinfo;385386/* Flush before emitting STATE_BASE_ADDRESS.387*388* This isn't documented anywhere in the PRM. However, it seems to be389* necessary prior to changing the surface state base address. We've390* seen issues in Vulkan where we get GPU hangs when using multi-level391* command buffers which clear depth, reset state base address, and then392* go render stuff.393*394* Normally, in GL, we would trust the kernel to do sufficient stalls395* and flushes prior to executing our batch. However, it doesn't seem396* as if the kernel's flushing is always sufficient and we don't want to397* rely on it.398*399* We make this an end-of-pipe sync instead of a normal flush because we400* do not know the current status of the GPU. On Haswell at least,401* having a fast-clear operation in flight at the same time as a normal402* rendering operation can cause hangs. Since the kernel's flushing is403* insufficient, we need to ensure that any rendering operations from404* other processes are definitely complete before we try to do our own405* rendering. It's a bit of a big hammer but it appears to work.406*/407iris_emit_end_of_pipe_sync(batch,408"change STATE_BASE_ADDRESS (flushes)",409PIPE_CONTROL_RENDER_TARGET_FLUSH |410PIPE_CONTROL_DEPTH_CACHE_FLUSH |411PIPE_CONTROL_DATA_CACHE_FLUSH |412/* Wa_1606662791:413*414* Software must program PIPE_CONTROL command415* with "HDC Pipeline Flush" prior to416* programming of the below two non-pipeline417* state :418* * STATE_BASE_ADDRESS419* * 3DSTATE_BINDING_TABLE_POOL_ALLOC420*/421((GFX_VER == 12 && devinfo->revision == 0 /* A0 */ ?422PIPE_CONTROL_FLUSH_HDC : 0)));423}424425static void426flush_after_state_base_change(struct iris_batch *batch)427{428/* After re-setting the surface state base address, we have to do some429* cache flusing so that the sampler engine will pick up the new430* SURFACE_STATE objects and binding tables. From the Broadwell PRM,431* Shared Function > 3D Sampler > State > State Caching (page 96):432*433* Coherency with system memory in the state cache, like the texture434* cache is handled partially by software. It is expected that the435* command stream or shader will issue Cache Flush operation or436* Cache_Flush sampler message to ensure that the L1 cache remains437* coherent with system memory.438*439* [...]440*441* Whenever the value of the Dynamic_State_Base_Addr,442* Surface_State_Base_Addr are altered, the L1 state cache must be443* invalidated to ensure the new surface or sampler state is fetched444* from system memory.445*446* The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit447* which, according the PIPE_CONTROL instruction documentation in the448* Broadwell PRM:449*450* Setting this bit is independent of any other bit in this packet.451* This bit controls the invalidation of the L1 and L2 state caches452* at the top of the pipe i.e. at the parsing time.453*454* Unfortunately, experimentation seems to indicate that state cache455* invalidation through a PIPE_CONTROL does nothing whatsoever in456* regards to surface state and binding tables. In stead, it seems that457* invalidating the texture cache is what is actually needed.458*459* XXX: As far as we have been able to determine through460* experimentation, shows that flush the texture cache appears to be461* sufficient. The theory here is that all of the sampling/rendering462* units cache the binding table in the texture cache. However, we have463* yet to be able to actually confirm this.464*/465iris_emit_end_of_pipe_sync(batch,466"change STATE_BASE_ADDRESS (invalidates)",467PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |468PIPE_CONTROL_CONST_CACHE_INVALIDATE |469PIPE_CONTROL_STATE_CACHE_INVALIDATE);470}471472static void473_iris_emit_lri(struct iris_batch *batch, uint32_t reg, uint32_t val)474{475iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {476lri.RegisterOffset = reg;477lri.DataDWord = val;478}479}480#define iris_emit_lri(b, r, v) _iris_emit_lri(b, GENX(r##_num), v)481482static void483_iris_emit_lrr(struct iris_batch *batch, uint32_t dst, uint32_t src)484{485iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {486lrr.SourceRegisterAddress = src;487lrr.DestinationRegisterAddress = dst;488}489}490491static void492iris_load_register_reg32(struct iris_batch *batch, uint32_t dst,493uint32_t src)494{495_iris_emit_lrr(batch, dst, src);496}497498static void499iris_load_register_reg64(struct iris_batch *batch, uint32_t dst,500uint32_t src)501{502_iris_emit_lrr(batch, dst, src);503_iris_emit_lrr(batch, dst + 4, src + 4);504}505506static void507iris_load_register_imm32(struct iris_batch *batch, uint32_t reg,508uint32_t val)509{510_iris_emit_lri(batch, reg, val);511}512513static void514iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,515uint64_t val)516{517_iris_emit_lri(batch, reg + 0, val & 0xffffffff);518_iris_emit_lri(batch, reg + 4, val >> 32);519}520521/**522* Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.523*/524static void525iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,526struct iris_bo *bo, uint32_t offset)527{528iris_batch_sync_region_start(batch);529iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {530lrm.RegisterAddress = reg;531lrm.MemoryAddress = ro_bo(bo, offset);532}533iris_batch_sync_region_end(batch);534}535536/**537* Load a 64-bit value from a buffer into a MMIO register via538* two MI_LOAD_REGISTER_MEM commands.539*/540static void541iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,542struct iris_bo *bo, uint32_t offset)543{544iris_load_register_mem32(batch, reg + 0, bo, offset + 0);545iris_load_register_mem32(batch, reg + 4, bo, offset + 4);546}547548static void549iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,550struct iris_bo *bo, uint32_t offset,551bool predicated)552{553iris_batch_sync_region_start(batch);554iris_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {555srm.RegisterAddress = reg;556srm.MemoryAddress = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);557srm.PredicateEnable = predicated;558}559iris_batch_sync_region_end(batch);560}561562static void563iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,564struct iris_bo *bo, uint32_t offset,565bool predicated)566{567iris_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);568iris_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);569}570571static void572iris_store_data_imm32(struct iris_batch *batch,573struct iris_bo *bo, uint32_t offset,574uint32_t imm)575{576iris_batch_sync_region_start(batch);577iris_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {578sdi.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);579sdi.ImmediateData = imm;580}581iris_batch_sync_region_end(batch);582}583584static void585iris_store_data_imm64(struct iris_batch *batch,586struct iris_bo *bo, uint32_t offset,587uint64_t imm)588{589/* Can't use iris_emit_cmd because MI_STORE_DATA_IMM has a length of590* 2 in genxml but it's actually variable length and we need 5 DWords.591*/592void *map = iris_get_command_space(batch, 4 * 5);593iris_batch_sync_region_start(batch);594_iris_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {595sdi.DWordLength = 5 - 2;596sdi.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);597sdi.ImmediateData = imm;598}599iris_batch_sync_region_end(batch);600}601602static void603iris_copy_mem_mem(struct iris_batch *batch,604struct iris_bo *dst_bo, uint32_t dst_offset,605struct iris_bo *src_bo, uint32_t src_offset,606unsigned bytes)607{608/* MI_COPY_MEM_MEM operates on DWords. */609assert(bytes % 4 == 0);610assert(dst_offset % 4 == 0);611assert(src_offset % 4 == 0);612iris_batch_sync_region_start(batch);613614for (unsigned i = 0; i < bytes; i += 4) {615iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {616cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,617IRIS_DOMAIN_OTHER_WRITE);618cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);619}620}621622iris_batch_sync_region_end(batch);623}624625static void626emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)627{628#if GFX_VER >= 8 && GFX_VER < 10629/* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:630*631* Software must clear the COLOR_CALC_STATE Valid field in632* 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT633* with Pipeline Select set to GPGPU.634*635* The internal hardware docs recommend the same workaround for Gfx9636* hardware too.637*/638if (pipeline == GPGPU)639iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);640#endif641642643/* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]644* PIPELINE_SELECT [DevBWR+]":645*646* "Project: DEVSNB+647*648* Software must ensure all the write caches are flushed through a649* stalling PIPE_CONTROL command followed by another PIPE_CONTROL650* command to invalidate read only caches prior to programming651* MI_PIPELINE_SELECT command to change the Pipeline Select Mode."652*/653iris_emit_pipe_control_flush(batch,654"workaround: PIPELINE_SELECT flushes (1/2)",655PIPE_CONTROL_RENDER_TARGET_FLUSH |656PIPE_CONTROL_DEPTH_CACHE_FLUSH |657PIPE_CONTROL_DATA_CACHE_FLUSH |658PIPE_CONTROL_CS_STALL);659660iris_emit_pipe_control_flush(batch,661"workaround: PIPELINE_SELECT flushes (2/2)",662PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |663PIPE_CONTROL_CONST_CACHE_INVALIDATE |664PIPE_CONTROL_STATE_CACHE_INVALIDATE |665PIPE_CONTROL_INSTRUCTION_INVALIDATE);666667iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {668#if GFX_VER >= 9669sel.MaskBits = GFX_VER >= 12 ? 0x13 : 3;670sel.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;671#endif672sel.PipelineSelection = pipeline;673}674}675676UNUSED static void677init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)678{679#if GFX_VER == 9680/* Project: DevGLK681*682* "This chicken bit works around a hardware issue with barrier683* logic encountered when switching between GPGPU and 3D pipelines.684* To workaround the issue, this mode bit should be set after a685* pipeline is selected."686*/687iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {688reg.GLKBarrierMode = value;689reg.GLKBarrierModeMask = 1;690}691#endif692}693694static void695init_state_base_address(struct iris_batch *batch)696{697struct isl_device *isl_dev = &batch->screen->isl_dev;698uint32_t mocs = isl_mocs(isl_dev, 0, false);699flush_before_state_base_change(batch);700701/* We program most base addresses once at context initialization time.702* Each base address points at a 4GB memory zone, and never needs to703* change. See iris_bufmgr.h for a description of the memory zones.704*705* The one exception is Surface State Base Address, which needs to be706* updated occasionally. See iris_binder.c for the details there.707*/708iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {709sba.GeneralStateMOCS = mocs;710sba.StatelessDataPortAccessMOCS = mocs;711sba.DynamicStateMOCS = mocs;712sba.IndirectObjectMOCS = mocs;713sba.InstructionMOCS = mocs;714sba.SurfaceStateMOCS = mocs;715716sba.GeneralStateBaseAddressModifyEnable = true;717sba.DynamicStateBaseAddressModifyEnable = true;718sba.IndirectObjectBaseAddressModifyEnable = true;719sba.InstructionBaseAddressModifyEnable = true;720sba.GeneralStateBufferSizeModifyEnable = true;721sba.DynamicStateBufferSizeModifyEnable = true;722#if (GFX_VER >= 9)723sba.BindlessSurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDLESS_START);724sba.BindlessSurfaceStateSize = (IRIS_BINDLESS_SIZE >> 12) - 1;725sba.BindlessSurfaceStateBaseAddressModifyEnable = true;726sba.BindlessSurfaceStateMOCS = mocs;727#endif728sba.IndirectObjectBufferSizeModifyEnable = true;729sba.InstructionBuffersizeModifyEnable = true;730731sba.InstructionBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);732sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);733734sba.GeneralStateBufferSize = 0xfffff;735sba.IndirectObjectBufferSize = 0xfffff;736sba.InstructionBufferSize = 0xfffff;737sba.DynamicStateBufferSize = 0xfffff;738}739740flush_after_state_base_change(batch);741}742743static void744iris_emit_l3_config(struct iris_batch *batch,745const struct intel_l3_config *cfg)746{747assert(cfg || GFX_VER >= 12);748749#if GFX_VER >= 12750#define L3_ALLOCATION_REG GENX(L3ALLOC)751#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)752#else753#define L3_ALLOCATION_REG GENX(L3CNTLREG)754#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)755#endif756757iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {758#if GFX_VER < 11759reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;760#endif761#if GFX_VER == 11762/* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set763* in L3CNTLREG register. The default setting of the bit is not the764* desirable behavior.765*/766reg.ErrorDetectionBehaviorControl = true;767reg.UseFullWays = true;768#endif769if (GFX_VER < 12 || cfg) {770reg.URBAllocation = cfg->n[INTEL_L3P_URB];771reg.ROAllocation = cfg->n[INTEL_L3P_RO];772reg.DCAllocation = cfg->n[INTEL_L3P_DC];773reg.AllAllocation = cfg->n[INTEL_L3P_ALL];774} else {775#if GFX_VER >= 12776reg.L3FullWayAllocationEnable = true;777#endif778}779}780}781782#if GFX_VER == 9783static void784iris_enable_obj_preemption(struct iris_batch *batch, bool enable)785{786/* A fixed function pipe flush is required before modifying this field */787iris_emit_end_of_pipe_sync(batch, enable ? "enable preemption"788: "disable preemption",789PIPE_CONTROL_RENDER_TARGET_FLUSH);790791/* enable object level preemption */792iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {793reg.ReplayMode = enable;794reg.ReplayModeMask = true;795}796}797#endif798799/**800* Compute an \p n x \p m pixel hashing table usable as slice, subslice or801* pixel pipe hashing table. The resulting table is the cyclic repetition of802* a fixed pattern with periodicity equal to \p period.803*804* If \p index is specified to be equal to \p period, a 2-way hashing table805* will be generated such that indices 0 and 1 are returned for the following806* fractions of entries respectively:807*808* p_0 = ceil(period / 2) / period809* p_1 = floor(period / 2) / period810*811* If \p index is even and less than \p period, a 3-way hashing table will be812* generated such that indices 0, 1 and 2 are returned for the following813* fractions of entries:814*815* p_0 = (ceil(period / 2) - 1) / period816* p_1 = floor(period / 2) / period817* p_2 = 1 / period818*819* The equations above apply if \p flip is equal to 0, if it is equal to 1 p_0820* and p_1 will be swapped for the result. Note that in the context of pixel821* pipe hashing this can be always 0 on Gfx12 platforms, since the hardware822* transparently remaps logical indices found on the table to physical pixel823* pipe indices from the highest to lowest EU count.824*/825UNUSED static void826calculate_pixel_hashing_table(unsigned n, unsigned m,827unsigned period, unsigned index, bool flip,828uint32_t *p)829{830for (unsigned i = 0; i < n; i++) {831for (unsigned j = 0; j < m; j++) {832const unsigned k = (i + j) % period;833p[j + m * i] = (k == index ? 2 : (k & 1) ^ flip);834}835}836}837838#if GFX_VER == 11839static void840gfx11_upload_pixel_hashing_tables(struct iris_batch *batch)841{842const struct intel_device_info *devinfo = &batch->screen->devinfo;843assert(devinfo->ppipe_subslices[2] == 0);844845if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])846return;847848struct iris_context *ice = batch->ice;849assert(&ice->batches[IRIS_BATCH_RENDER] == batch);850851unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;852uint32_t hash_address;853struct pipe_resource *tmp = NULL;854uint32_t *map =855stream_state(batch, ice->state.dynamic_uploader, &tmp,856size, 64, &hash_address);857pipe_resource_reference(&tmp, NULL);858859const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];860struct GENX(SLICE_HASH_TABLE) table;861calculate_pixel_hashing_table(16, 16, 3, 3, flip, table.Entry[0]);862863GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);864865iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {866ptr.SliceHashStatePointerValid = true;867ptr.SliceHashTableStatePointer = hash_address;868}869870iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {871mode.SliceHashingTableEnable = true;872}873}874#elif GFX_VERx10 == 120875static void876gfx12_upload_pixel_hashing_tables(struct iris_batch *batch)877{878const struct intel_device_info *devinfo = &batch->screen->devinfo;879/* For each n calculate ppipes_of[n], equal to the number of pixel pipes880* present with n active dual subslices.881*/882unsigned ppipes_of[3] = {};883884for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {885for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)886ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);887}888889/* Gfx12 has three pixel pipes. */890assert(ppipes_of[0] + ppipes_of[1] + ppipes_of[2] == 3);891892if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {893/* All three pixel pipes have the maximum number of active dual894* subslices, or there is only one active pixel pipe: Nothing to do.895*/896return;897}898899iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {900p.SliceHashControl[0] = TABLE_0;901902if (ppipes_of[2] == 2 && ppipes_of[0] == 1)903calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);904else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)905calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);906907if (ppipes_of[2] == 2 && ppipes_of[1] == 1)908calculate_pixel_hashing_table(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);909else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)910calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);911else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)912calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);913else914unreachable("Illegal fusing.");915}916917iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {918p.SubsliceHashingTableEnable = true;919p.SubsliceHashingTableEnableMask = true;920}921}922#endif923924static void925iris_alloc_push_constants(struct iris_batch *batch)926{927/* For now, we set a static partitioning of the push constant area,928* assuming that all stages could be in use.929*930* TODO: Try lazily allocating the HS/DS/GS sections as needed, and931* see if that improves performance by offering more space to932* the VS/FS when those aren't in use. Also, try dynamically933* enabling/disabling it like i965 does. This would be more934* stalls and may not actually help; we don't know yet.935*/936for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {937iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {938alloc._3DCommandSubOpcode = 18 + i;939alloc.ConstantBufferOffset = 6 * i;940alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? 8 : 6;941}942}943}944945#if GFX_VER >= 12946static void947init_aux_map_state(struct iris_batch *batch);948#endif949950/**951* Upload initial GPU state for any kind of context.952*953* These need to happen for both render and compute.954*/955static void956iris_init_common_context(struct iris_batch *batch)957{958#if GFX_VER == 11959iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {960reg.HeaderlessMessageforPreemptableContexts = 1;961reg.HeaderlessMessageforPreemptableContextsMask = 1;962}963964/* Bit 1 must be set in HALF_SLICE_CHICKEN7. */965iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {966reg.EnabledTexelOffsetPrecisionFix = 1;967reg.EnabledTexelOffsetPrecisionFixMask = 1;968}969#endif970}971972/**973* Upload the initial GPU state for a render context.974*975* This sets some invariant state that needs to be programmed a particular976* way, but we never actually change.977*/978static void979iris_init_render_context(struct iris_batch *batch)980{981UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;982983iris_batch_sync_region_start(batch);984985emit_pipeline_select(batch, _3D);986987iris_emit_l3_config(batch, batch->screen->l3_config_3d);988989init_state_base_address(batch);990991iris_init_common_context(batch);992993#if GFX_VER >= 9994iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {995reg.CONSTANT_BUFFERAddressOffsetDisable = true;996reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;997}998#else999iris_emit_reg(batch, GENX(INSTPM), reg) {1000reg.CONSTANT_BUFFERAddressOffsetDisable = true;1001reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;1002}1003#endif10041005#if GFX_VER == 91006iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {1007reg.FloatBlendOptimizationEnable = true;1008reg.FloatBlendOptimizationEnableMask = true;1009reg.MSCRAWHazardAvoidanceBit = true;1010reg.MSCRAWHazardAvoidanceBitMask = true;1011reg.PartialResolveDisableInVC = true;1012reg.PartialResolveDisableInVCMask = true;1013}10141015if (devinfo->is_geminilake)1016init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);1017#endif10181019#if GFX_VER == 111020iris_emit_reg(batch, GENX(TCCNTLREG), reg) {1021reg.L3DataPartialWriteMergingEnable = true;1022reg.ColorZPartialWriteMergingEnable = true;1023reg.URBPartialWriteMergingEnable = true;1024reg.TCDisable = true;1025}10261027/* Hardware specification recommends disabling repacking for the1028* compatibility with decompression mechanism in display controller.1029*/1030if (devinfo->disable_ccs_repack) {1031iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {1032reg.DisableRepackingforCompression = true;1033reg.DisableRepackingforCompressionMask = true;1034}1035}10361037gfx11_upload_pixel_hashing_tables(batch);1038#endif10391040#if GFX_VERx10 == 1201041gfx12_upload_pixel_hashing_tables(batch);1042#endif10431044/* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid1045* changing it dynamically. We set it to the maximum size here, and1046* instead include the render target dimensions in the viewport, so1047* viewport extents clipping takes care of pruning stray geometry.1048*/1049iris_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {1050rect.ClippedDrawingRectangleXMax = UINT16_MAX;1051rect.ClippedDrawingRectangleYMax = UINT16_MAX;1052}10531054/* Set the initial MSAA sample positions. */1055iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {1056INTEL_SAMPLE_POS_1X(pat._1xSample);1057INTEL_SAMPLE_POS_2X(pat._2xSample);1058INTEL_SAMPLE_POS_4X(pat._4xSample);1059INTEL_SAMPLE_POS_8X(pat._8xSample);1060#if GFX_VER >= 91061INTEL_SAMPLE_POS_16X(pat._16xSample);1062#endif1063}10641065/* Use the legacy AA line coverage computation. */1066iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);10671068/* Disable chromakeying (it's for media) */1069iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);10701071/* We want regular rendering, not special HiZ operations. */1072iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);10731074/* No polygon stippling offsets are necessary. */1075/* TODO: may need to set an offset for origin-UL framebuffers */1076iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);10771078iris_alloc_push_constants(batch);107910801081#if GFX_VER >= 121082init_aux_map_state(batch);1083#endif10841085iris_batch_sync_region_end(batch);1086}10871088static void1089iris_init_compute_context(struct iris_batch *batch)1090{1091UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;10921093iris_batch_sync_region_start(batch);10941095/* Wa_1607854226:1096*1097* Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.1098*/1099#if GFX_VER == 121100emit_pipeline_select(batch, _3D);1101#else1102emit_pipeline_select(batch, GPGPU);1103#endif11041105iris_emit_l3_config(batch, batch->screen->l3_config_cs);11061107init_state_base_address(batch);11081109iris_init_common_context(batch);11101111#if GFX_VER == 121112emit_pipeline_select(batch, GPGPU);1113#endif11141115#if GFX_VER == 91116if (devinfo->is_geminilake)1117init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);1118#endif11191120#if GFX_VER >= 121121init_aux_map_state(batch);1122#endif11231124iris_batch_sync_region_end(batch);1125}11261127struct iris_vertex_buffer_state {1128/** The VERTEX_BUFFER_STATE hardware structure. */1129uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];11301131/** The resource to source vertex data from. */1132struct pipe_resource *resource;11331134int offset;1135};11361137struct iris_depth_buffer_state {1138/* Depth/HiZ/Stencil related hardware packets. */1139uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +1140GENX(3DSTATE_STENCIL_BUFFER_length) +1141GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +1142GENX(3DSTATE_CLEAR_PARAMS_length) +1143GENX(MI_LOAD_REGISTER_IMM_length) * 2];1144};11451146/**1147* Generation-specific context state (ice->state.genx->...).1148*1149* Most state can go in iris_context directly, but these encode hardware1150* packets which vary by generation.1151*/1152struct iris_genx_state {1153struct iris_vertex_buffer_state vertex_buffers[33];1154uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];11551156struct iris_depth_buffer_state depth_buffer;11571158uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];11591160#if GFX_VER == 81161bool pma_fix_enabled;1162#endif11631164#if GFX_VER == 91165/* Is object level preemption enabled? */1166bool object_preemption;1167#endif11681169struct {1170#if GFX_VER == 81171struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];1172#endif1173} shaders[MESA_SHADER_STAGES];1174};11751176/**1177* The pipe->set_blend_color() driver hook.1178*1179* This corresponds to our COLOR_CALC_STATE.1180*/1181static void1182iris_set_blend_color(struct pipe_context *ctx,1183const struct pipe_blend_color *state)1184{1185struct iris_context *ice = (struct iris_context *) ctx;11861187/* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */1188memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));1189ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;1190}11911192/**1193* Gallium CSO for blend state (see pipe_blend_state).1194*/1195struct iris_blend_state {1196/** Partial 3DSTATE_PS_BLEND */1197uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];11981199/** Partial BLEND_STATE */1200uint32_t blend_state[GENX(BLEND_STATE_length) +1201BRW_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];12021203bool alpha_to_coverage; /* for shader key */12041205/** Bitfield of whether blending is enabled for RT[i] - for aux resolves */1206uint8_t blend_enables;12071208/** Bitfield of whether color writes are enabled for RT[i] */1209uint8_t color_write_enables;12101211/** Does RT[0] use dual color blending? */1212bool dual_color_blending;1213};12141215static enum pipe_blendfactor1216fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)1217{1218if (alpha_to_one) {1219if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)1220return PIPE_BLENDFACTOR_ONE;12211222if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)1223return PIPE_BLENDFACTOR_ZERO;1224}12251226return f;1227}12281229/**1230* The pipe->create_blend_state() driver hook.1231*1232* Translates a pipe_blend_state into iris_blend_state.1233*/1234static void *1235iris_create_blend_state(struct pipe_context *ctx,1236const struct pipe_blend_state *state)1237{1238struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));1239uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);12401241cso->blend_enables = 0;1242cso->color_write_enables = 0;1243STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);12441245cso->alpha_to_coverage = state->alpha_to_coverage;12461247bool indep_alpha_blend = false;12481249for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {1250const struct pipe_rt_blend_state *rt =1251&state->rt[state->independent_blend_enable ? i : 0];12521253enum pipe_blendfactor src_rgb =1254fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);1255enum pipe_blendfactor src_alpha =1256fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);1257enum pipe_blendfactor dst_rgb =1258fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);1259enum pipe_blendfactor dst_alpha =1260fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);12611262if (rt->rgb_func != rt->alpha_func ||1263src_rgb != src_alpha || dst_rgb != dst_alpha)1264indep_alpha_blend = true;12651266if (rt->blend_enable)1267cso->blend_enables |= 1u << i;12681269if (rt->colormask)1270cso->color_write_enables |= 1u << i;12711272iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {1273be.LogicOpEnable = state->logicop_enable;1274be.LogicOpFunction = state->logicop_func;12751276be.PreBlendSourceOnlyClampEnable = false;1277be.ColorClampRange = COLORCLAMP_RTFORMAT;1278be.PreBlendColorClampEnable = true;1279be.PostBlendColorClampEnable = true;12801281be.ColorBufferBlendEnable = rt->blend_enable;12821283be.ColorBlendFunction = rt->rgb_func;1284be.AlphaBlendFunction = rt->alpha_func;12851286/* The casts prevent warnings about implicit enum type conversions. */1287be.SourceBlendFactor = (int) src_rgb;1288be.SourceAlphaBlendFactor = (int) src_alpha;1289be.DestinationBlendFactor = (int) dst_rgb;1290be.DestinationAlphaBlendFactor = (int) dst_alpha;12911292be.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);1293be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);1294be.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);1295be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);1296}1297blend_entry += GENX(BLEND_STATE_ENTRY_length);1298}12991300iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {1301/* pb.HasWriteableRT is filled in at draw time.1302* pb.AlphaTestEnable is filled in at draw time.1303*1304* pb.ColorBufferBlendEnable is filled in at draw time so we can avoid1305* setting it when dual color blending without an appropriate shader.1306*/13071308pb.AlphaToCoverageEnable = state->alpha_to_coverage;1309pb.IndependentAlphaBlendEnable = indep_alpha_blend;13101311/* The casts prevent warnings about implicit enum type conversions. */1312pb.SourceBlendFactor =1313(int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);1314pb.SourceAlphaBlendFactor =1315(int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);1316pb.DestinationBlendFactor =1317(int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);1318pb.DestinationAlphaBlendFactor =1319(int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);1320}13211322iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {1323bs.AlphaToCoverageEnable = state->alpha_to_coverage;1324bs.IndependentAlphaBlendEnable = indep_alpha_blend;1325bs.AlphaToOneEnable = state->alpha_to_one;1326bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage;1327bs.ColorDitherEnable = state->dither;1328/* bl.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */1329}13301331cso->dual_color_blending = util_blend_state_is_dual(state, 0);13321333return cso;1334}13351336/**1337* The pipe->bind_blend_state() driver hook.1338*1339* Bind a blending CSO and flag related dirty bits.1340*/1341static void1342iris_bind_blend_state(struct pipe_context *ctx, void *state)1343{1344struct iris_context *ice = (struct iris_context *) ctx;1345struct iris_blend_state *cso = state;13461347ice->state.cso_blend = cso;13481349ice->state.dirty |= IRIS_DIRTY_PS_BLEND;1350ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;1351ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];13521353if (GFX_VER == 8)1354ice->state.dirty |= IRIS_DIRTY_PMA_FIX;1355}13561357/**1358* Return true if the FS writes to any color outputs which are not disabled1359* via color masking.1360*/1361static bool1362has_writeable_rt(const struct iris_blend_state *cso_blend,1363const struct shader_info *fs_info)1364{1365if (!fs_info)1366return false;13671368unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;13691370if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))1371rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;13721373return cso_blend->color_write_enables & rt_outputs;1374}13751376/**1377* Gallium CSO for depth, stencil, and alpha testing state.1378*/1379struct iris_depth_stencil_alpha_state {1380/** Partial 3DSTATE_WM_DEPTH_STENCIL. */1381uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];13821383#if GFX_VER >= 121384uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];1385#endif13861387/** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */1388unsigned alpha_enabled:1;1389unsigned alpha_func:3; /**< PIPE_FUNC_x */1390float alpha_ref_value; /**< reference value */13911392/** Outbound to resolve and cache set tracking. */1393bool depth_writes_enabled;1394bool stencil_writes_enabled;13951396/** Outbound to Gfx8-9 PMA stall equations */1397bool depth_test_enabled;1398};13991400/**1401* The pipe->create_depth_stencil_alpha_state() driver hook.1402*1403* We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha1404* testing state since we need pieces of it in a variety of places.1405*/1406static void *1407iris_create_zsa_state(struct pipe_context *ctx,1408const struct pipe_depth_stencil_alpha_state *state)1409{1410struct iris_depth_stencil_alpha_state *cso =1411malloc(sizeof(struct iris_depth_stencil_alpha_state));14121413bool two_sided_stencil = state->stencil[1].enabled;14141415cso->alpha_enabled = state->alpha_enabled;1416cso->alpha_func = state->alpha_func;1417cso->alpha_ref_value = state->alpha_ref_value;1418cso->depth_writes_enabled = state->depth_writemask;1419cso->depth_test_enabled = state->depth_enabled;1420cso->stencil_writes_enabled =1421state->stencil[0].writemask != 0 ||1422(two_sided_stencil && state->stencil[1].writemask != 0);14231424/* gallium frontends need to optimize away EQUAL writes for us. */1425assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));14261427iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {1428wmds.StencilFailOp = state->stencil[0].fail_op;1429wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;1430wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;1431wmds.StencilTestFunction =1432translate_compare_func(state->stencil[0].func);1433wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;1434wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;1435wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;1436wmds.BackfaceStencilTestFunction =1437translate_compare_func(state->stencil[1].func);1438wmds.DepthTestFunction = translate_compare_func(state->depth_func);1439wmds.DoubleSidedStencilEnable = two_sided_stencil;1440wmds.StencilTestEnable = state->stencil[0].enabled;1441wmds.StencilBufferWriteEnable =1442state->stencil[0].writemask != 0 ||1443(two_sided_stencil && state->stencil[1].writemask != 0);1444wmds.DepthTestEnable = state->depth_enabled;1445wmds.DepthBufferWriteEnable = state->depth_writemask;1446wmds.StencilTestMask = state->stencil[0].valuemask;1447wmds.StencilWriteMask = state->stencil[0].writemask;1448wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;1449wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;1450/* wmds.[Backface]StencilReferenceValue are merged later */1451#if GFX_VER >= 121452wmds.StencilReferenceValueModifyDisable = true;1453#endif1454}14551456#if GFX_VER >= 121457iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {1458depth_bounds.DepthBoundsTestValueModifyDisable = false;1459depth_bounds.DepthBoundsTestEnableModifyDisable = false;1460depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;1461depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;1462depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;1463}1464#endif14651466return cso;1467}14681469/**1470* The pipe->bind_depth_stencil_alpha_state() driver hook.1471*1472* Bind a depth/stencil/alpha CSO and flag related dirty bits.1473*/1474static void1475iris_bind_zsa_state(struct pipe_context *ctx, void *state)1476{1477struct iris_context *ice = (struct iris_context *) ctx;1478struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;1479struct iris_depth_stencil_alpha_state *new_cso = state;14801481if (new_cso) {1482if (cso_changed(alpha_ref_value))1483ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;14841485if (cso_changed(alpha_enabled))1486ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;14871488if (cso_changed(alpha_func))1489ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;14901491if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))1492ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;14931494ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;1495ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;14961497#if GFX_VER >= 121498if (cso_changed(depth_bounds))1499ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;1500#endif1501}15021503ice->state.cso_zsa = new_cso;1504ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;1505ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;1506ice->state.stage_dirty |=1507ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];15081509if (GFX_VER == 8)1510ice->state.dirty |= IRIS_DIRTY_PMA_FIX;1511}15121513#if GFX_VER == 81514static bool1515want_pma_fix(struct iris_context *ice)1516{1517UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;1518UNUSED const struct intel_device_info *devinfo = &screen->devinfo;1519const struct brw_wm_prog_data *wm_prog_data = (void *)1520ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;1521const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;1522const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;1523const struct iris_blend_state *cso_blend = ice->state.cso_blend;15241525/* In very specific combinations of state, we can instruct Gfx8-9 hardware1526* to avoid stalling at the pixel mask array. The state equations are1527* documented in these places:1528*1529* - Gfx8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE1530* - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable1531*1532* Both equations share some common elements:1533*1534* no_hiz_op =1535* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||1536* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||1537* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||1538* 3DSTATE_WM_HZ_OP::StencilBufferClear) &&1539*1540* killpixels =1541* 3DSTATE_WM::ForceKillPix != ForceOff &&1542* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||1543* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||1544* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||1545* 3DSTATE_PS_BLEND::AlphaTestEnable ||1546* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)1547*1548* (Technically the stencil PMA treats ForceKillPix differently,1549* but I think this is a documentation oversight, and we don't1550* ever use it in this way, so it doesn't matter).1551*1552* common_pma_fix =1553* 3DSTATE_WM::ForceThreadDispatch != 1 &&1554* 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&1555* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&1556* 3DSTATE_DEPTH_BUFFER::HIZ Enable &&1557* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&1558* 3DSTATE_PS_EXTRA::PixelShaderValid &&1559* no_hiz_op1560*1561* These are always true:1562*1563* 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_01564* 3DSTATE_PS_EXTRA::PixelShaderValid1565*1566* Also, we never use the normal drawing path for HiZ ops; these are true:1567*1568* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||1569* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||1570* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||1571* 3DSTATE_WM_HZ_OP::StencilBufferClear)1572*1573* This happens sometimes:1574*1575* 3DSTATE_WM::ForceThreadDispatch != 11576*1577* However, we choose to ignore it as it either agrees with the signal1578* (dispatch was already enabled, so nothing out of the ordinary), or1579* there are no framebuffer attachments (so no depth or HiZ anyway,1580* meaning the PMA signal will already be disabled).1581*/15821583if (!cso_fb->zsbuf)1584return false;15851586struct iris_resource *zres, *sres;1587iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);15881589/* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&1590* 3DSTATE_DEPTH_BUFFER::HIZ Enable &&1591*/1592if (!zres || !iris_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))1593return false;15941595/* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */1596if (wm_prog_data->early_fragment_tests)1597return false;15981599/* 3DSTATE_WM::ForceKillPix != ForceOff &&1600* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||1601* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||1602* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||1603* 3DSTATE_PS_BLEND::AlphaTestEnable ||1604* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)1605*/1606bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||1607cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;16081609/* The Gfx8 depth PMA equation becomes:1610*1611* depth_writes =1612* 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&1613* 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE1614*1615* stencil_writes =1616* 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&1617* 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&1618* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE1619*1620* Z_PMA_OPT =1621* common_pma_fix &&1622* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&1623* ((killpixels && (depth_writes || stencil_writes)) ||1624* 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)1625*1626*/1627if (!cso_zsa->depth_test_enabled)1628return false;16291630return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||1631(killpixels && (cso_zsa->depth_writes_enabled ||1632(sres && cso_zsa->stencil_writes_enabled)));1633}1634#endif16351636void1637genX(update_pma_fix)(struct iris_context *ice,1638struct iris_batch *batch,1639bool enable)1640{1641#if GFX_VER == 81642struct iris_genx_state *genx = ice->state.genx;16431644if (genx->pma_fix_enabled == enable)1645return;16461647genx->pma_fix_enabled = enable;16481649/* According to the Broadwell PIPE_CONTROL documentation, software should1650* emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set1651* prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary.1652*1653* The Gfx9 docs say to use a depth stall rather than a command streamer1654* stall. However, the hardware seems to violently disagree. A full1655* command streamer stall seems to be needed in both cases.1656*/1657iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",1658PIPE_CONTROL_CS_STALL |1659PIPE_CONTROL_DEPTH_CACHE_FLUSH |1660PIPE_CONTROL_RENDER_TARGET_FLUSH);16611662iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {1663reg.NPPMAFixEnable = enable;1664reg.NPEarlyZFailsDisable = enable;1665reg.NPPMAFixEnableMask = true;1666reg.NPEarlyZFailsDisableMask = true;1667}16681669/* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache1670* Flush bits is often necessary. We do it regardless because it's easier.1671* The render cache flush is also necessary if stencil writes are enabled.1672*1673* Again, the Gfx9 docs give a different set of flushes but the Broadwell1674* flushes seem to work just as well.1675*/1676iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",1677PIPE_CONTROL_DEPTH_STALL |1678PIPE_CONTROL_DEPTH_CACHE_FLUSH |1679PIPE_CONTROL_RENDER_TARGET_FLUSH);1680#endif1681}16821683/**1684* Gallium CSO for rasterizer state.1685*/1686struct iris_rasterizer_state {1687uint32_t sf[GENX(3DSTATE_SF_length)];1688uint32_t clip[GENX(3DSTATE_CLIP_length)];1689uint32_t raster[GENX(3DSTATE_RASTER_length)];1690uint32_t wm[GENX(3DSTATE_WM_length)];1691uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];16921693uint8_t num_clip_plane_consts;1694bool clip_halfz; /* for CC_VIEWPORT */1695bool depth_clip_near; /* for CC_VIEWPORT */1696bool depth_clip_far; /* for CC_VIEWPORT */1697bool flatshade; /* for shader state */1698bool flatshade_first; /* for stream output */1699bool clamp_fragment_color; /* for shader state */1700bool light_twoside; /* for shader state */1701bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */1702bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */1703bool line_stipple_enable;1704bool poly_stipple_enable;1705bool multisample;1706bool force_persample_interp;1707bool conservative_rasterization;1708bool fill_mode_point;1709bool fill_mode_line;1710bool fill_mode_point_or_line;1711enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */1712uint16_t sprite_coord_enable;1713};17141715static float1716get_line_width(const struct pipe_rasterizer_state *state)1717{1718float line_width = state->line_width;17191720/* From the OpenGL 4.4 spec:1721*1722* "The actual width of non-antialiased lines is determined by rounding1723* the supplied width to the nearest integer, then clamping it to the1724* implementation-dependent maximum non-antialiased line width."1725*/1726if (!state->multisample && !state->line_smooth)1727line_width = roundf(state->line_width);17281729if (!state->multisample && state->line_smooth && line_width < 1.5f) {1730/* For 1 pixel line thickness or less, the general anti-aliasing1731* algorithm gives up, and a garbage line is generated. Setting a1732* Line Width of 0.0 specifies the rasterization of the "thinnest"1733* (one-pixel-wide), non-antialiased lines.1734*1735* Lines rendered with zero Line Width are rasterized using the1736* "Grid Intersection Quantization" rules as specified by the1737* "Zero-Width (Cosmetic) Line Rasterization" section of the docs.1738*/1739line_width = 0.0f;1740}17411742return line_width;1743}17441745/**1746* The pipe->create_rasterizer_state() driver hook.1747*/1748static void *1749iris_create_rasterizer_state(struct pipe_context *ctx,1750const struct pipe_rasterizer_state *state)1751{1752struct iris_rasterizer_state *cso =1753malloc(sizeof(struct iris_rasterizer_state));17541755cso->multisample = state->multisample;1756cso->force_persample_interp = state->force_persample_interp;1757cso->clip_halfz = state->clip_halfz;1758cso->depth_clip_near = state->depth_clip_near;1759cso->depth_clip_far = state->depth_clip_far;1760cso->flatshade = state->flatshade;1761cso->flatshade_first = state->flatshade_first;1762cso->clamp_fragment_color = state->clamp_fragment_color;1763cso->light_twoside = state->light_twoside;1764cso->rasterizer_discard = state->rasterizer_discard;1765cso->half_pixel_center = state->half_pixel_center;1766cso->sprite_coord_mode = state->sprite_coord_mode;1767cso->sprite_coord_enable = state->sprite_coord_enable;1768cso->line_stipple_enable = state->line_stipple_enable;1769cso->poly_stipple_enable = state->poly_stipple_enable;1770cso->conservative_rasterization =1771state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;17721773cso->fill_mode_point =1774state->fill_front == PIPE_POLYGON_MODE_POINT ||1775state->fill_back == PIPE_POLYGON_MODE_POINT;1776cso->fill_mode_line =1777state->fill_front == PIPE_POLYGON_MODE_LINE ||1778state->fill_back == PIPE_POLYGON_MODE_LINE;1779cso->fill_mode_point_or_line =1780cso->fill_mode_point ||1781cso->fill_mode_line;17821783if (state->clip_plane_enable != 0)1784cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;1785else1786cso->num_clip_plane_consts = 0;17871788float line_width = get_line_width(state);17891790iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {1791sf.StatisticsEnable = true;1792sf.AALineDistanceMode = AALINEDISTANCE_TRUE;1793sf.LineEndCapAntialiasingRegionWidth =1794state->line_smooth ? _10pixels : _05pixels;1795sf.LastPixelEnable = state->line_last_pixel;1796sf.LineWidth = line_width;1797sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&1798!state->point_quad_rasterization;1799sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;1800sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);18011802if (state->flatshade_first) {1803sf.TriangleFanProvokingVertexSelect = 1;1804} else {1805sf.TriangleStripListProvokingVertexSelect = 2;1806sf.TriangleFanProvokingVertexSelect = 2;1807sf.LineStripListProvokingVertexSelect = 1;1808}1809}18101811iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {1812rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;1813rr.CullMode = translate_cull_mode(state->cull_face);1814rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);1815rr.BackFaceFillMode = translate_fill_mode(state->fill_back);1816rr.DXMultisampleRasterizationEnable = state->multisample;1817rr.GlobalDepthOffsetEnableSolid = state->offset_tri;1818rr.GlobalDepthOffsetEnableWireframe = state->offset_line;1819rr.GlobalDepthOffsetEnablePoint = state->offset_point;1820rr.GlobalDepthOffsetConstant = state->offset_units * 2;1821rr.GlobalDepthOffsetScale = state->offset_scale;1822rr.GlobalDepthOffsetClamp = state->offset_clamp;1823rr.SmoothPointEnable = state->point_smooth;1824rr.AntialiasingEnable = state->line_smooth;1825rr.ScissorRectangleEnable = state->scissor;1826#if GFX_VER >= 91827rr.ViewportZNearClipTestEnable = state->depth_clip_near;1828rr.ViewportZFarClipTestEnable = state->depth_clip_far;1829rr.ConservativeRasterizationEnable =1830cso->conservative_rasterization;1831#else1832rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);1833#endif1834}18351836iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {1837/* cl.NonPerspectiveBarycentricEnable is filled in at draw time from1838* the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.1839*/1840cl.EarlyCullEnable = true;1841cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;1842cl.ForceUserClipDistanceClipTestEnableBitmask = true;1843cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;1844cl.GuardbandClipTestEnable = true;1845cl.ClipEnable = true;1846cl.MinimumPointWidth = 0.125;1847cl.MaximumPointWidth = 255.875;18481849if (state->flatshade_first) {1850cl.TriangleFanProvokingVertexSelect = 1;1851} else {1852cl.TriangleStripListProvokingVertexSelect = 2;1853cl.TriangleFanProvokingVertexSelect = 2;1854cl.LineStripListProvokingVertexSelect = 1;1855}1856}18571858iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {1859/* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are1860* filled in at draw time from the FS program.1861*/1862wm.LineAntialiasingRegionWidth = _10pixels;1863wm.LineEndCapAntialiasingRegionWidth = _05pixels;1864wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;1865wm.LineStippleEnable = state->line_stipple_enable;1866wm.PolygonStippleEnable = state->poly_stipple_enable;1867}18681869/* Remap from 0..255 back to 1..256 */1870const unsigned line_stipple_factor = state->line_stipple_factor + 1;18711872iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {1873if (state->line_stipple_enable) {1874line.LineStipplePattern = state->line_stipple_pattern;1875line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;1876line.LineStippleRepeatCount = line_stipple_factor;1877}1878}18791880return cso;1881}18821883/**1884* The pipe->bind_rasterizer_state() driver hook.1885*1886* Bind a rasterizer CSO and flag related dirty bits.1887*/1888static void1889iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)1890{1891struct iris_context *ice = (struct iris_context *) ctx;1892struct iris_rasterizer_state *old_cso = ice->state.cso_rast;1893struct iris_rasterizer_state *new_cso = state;18941895if (new_cso) {1896/* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */1897if (cso_changed_memcmp(line_stipple))1898ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;18991900if (cso_changed(half_pixel_center))1901ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;19021903if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))1904ice->state.dirty |= IRIS_DIRTY_WM;19051906if (cso_changed(rasterizer_discard))1907ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;19081909if (cso_changed(flatshade_first))1910ice->state.dirty |= IRIS_DIRTY_STREAMOUT;19111912if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||1913cso_changed(clip_halfz))1914ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;19151916if (cso_changed(sprite_coord_enable) ||1917cso_changed(sprite_coord_mode) ||1918cso_changed(light_twoside))1919ice->state.dirty |= IRIS_DIRTY_SBE;19201921if (cso_changed(conservative_rasterization))1922ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;1923}19241925ice->state.cso_rast = new_cso;1926ice->state.dirty |= IRIS_DIRTY_RASTER;1927ice->state.dirty |= IRIS_DIRTY_CLIP;1928ice->state.stage_dirty |=1929ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];1930}19311932/**1933* Return true if the given wrap mode requires the border color to exist.1934*1935* (We can skip uploading it if the sampler isn't going to use it.)1936*/1937static bool1938wrap_mode_needs_border_color(unsigned wrap_mode)1939{1940return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;1941}19421943/**1944* Gallium CSO for sampler state.1945*/1946struct iris_sampler_state {1947union pipe_color_union border_color;1948bool needs_border_color;19491950uint32_t sampler_state[GENX(SAMPLER_STATE_length)];1951};19521953/**1954* The pipe->create_sampler_state() driver hook.1955*1956* We fill out SAMPLER_STATE (except for the border color pointer), and1957* store that on the CPU. It doesn't make sense to upload it to a GPU1958* buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires1959* all bound sampler states to be in contiguous memor.1960*/1961static void *1962iris_create_sampler_state(struct pipe_context *ctx,1963const struct pipe_sampler_state *state)1964{1965struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);19661967if (!cso)1968return NULL;19691970STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);1971STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);19721973unsigned wrap_s = translate_wrap(state->wrap_s);1974unsigned wrap_t = translate_wrap(state->wrap_t);1975unsigned wrap_r = translate_wrap(state->wrap_r);19761977memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));19781979cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||1980wrap_mode_needs_border_color(wrap_t) ||1981wrap_mode_needs_border_color(wrap_r);19821983float min_lod = state->min_lod;1984unsigned mag_img_filter = state->mag_img_filter;19851986// XXX: explain this code ported from ilo...I don't get it at all...1987if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&1988state->min_lod > 0.0f) {1989min_lod = 0.0f;1990mag_img_filter = state->min_img_filter;1991}19921993iris_pack_state(GENX(SAMPLER_STATE), cso->sampler_state, samp) {1994samp.TCXAddressControlMode = wrap_s;1995samp.TCYAddressControlMode = wrap_t;1996samp.TCZAddressControlMode = wrap_r;1997samp.CubeSurfaceControlMode = state->seamless_cube_map;1998samp.NonnormalizedCoordinateEnable = !state->normalized_coords;1999samp.MinModeFilter = state->min_img_filter;2000samp.MagModeFilter = mag_img_filter;2001samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);2002samp.MaximumAnisotropy = RATIO21;20032004if (state->max_anisotropy >= 2) {2005if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {2006samp.MinModeFilter = MAPFILTER_ANISOTROPIC;2007samp.AnisotropicAlgorithm = EWAApproximation;2008}20092010if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)2011samp.MagModeFilter = MAPFILTER_ANISOTROPIC;20122013samp.MaximumAnisotropy =2014MIN2((state->max_anisotropy - 2) / 2, RATIO161);2015}20162017/* Set address rounding bits if not using nearest filtering. */2018if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {2019samp.UAddressMinFilterRoundingEnable = true;2020samp.VAddressMinFilterRoundingEnable = true;2021samp.RAddressMinFilterRoundingEnable = true;2022}20232024if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {2025samp.UAddressMagFilterRoundingEnable = true;2026samp.VAddressMagFilterRoundingEnable = true;2027samp.RAddressMagFilterRoundingEnable = true;2028}20292030if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)2031samp.ShadowFunction = translate_shadow_func(state->compare_func);20322033const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;20342035samp.LODPreClampMode = CLAMP_MODE_OGL;2036samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);2037samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);2038samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);20392040/* .BorderColorPointer is filled in by iris_bind_sampler_states. */2041}20422043return cso;2044}20452046/**2047* The pipe->bind_sampler_states() driver hook.2048*/2049static void2050iris_bind_sampler_states(struct pipe_context *ctx,2051enum pipe_shader_type p_stage,2052unsigned start, unsigned count,2053void **states)2054{2055struct iris_context *ice = (struct iris_context *) ctx;2056gl_shader_stage stage = stage_from_pipe(p_stage);2057struct iris_shader_state *shs = &ice->state.shaders[stage];20582059assert(start + count <= IRIS_MAX_TEXTURE_SAMPLERS);20602061bool dirty = false;20622063for (int i = 0; i < count; i++) {2064if (shs->samplers[start + i] != states[i]) {2065shs->samplers[start + i] = states[i];2066dirty = true;2067}2068}20692070if (dirty)2071ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;2072}20732074/**2075* Upload the sampler states into a contiguous area of GPU memory, for2076* for 3DSTATE_SAMPLER_STATE_POINTERS_*.2077*2078* Also fill out the border color state pointers.2079*/2080static void2081iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)2082{2083struct iris_shader_state *shs = &ice->state.shaders[stage];2084const struct shader_info *info = iris_get_shader_info(ice, stage);20852086/* We assume gallium frontends will call pipe->bind_sampler_states()2087* if the program's number of textures changes.2088*/2089unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;20902091if (!count)2092return;20932094/* Assemble the SAMPLER_STATEs into a contiguous table that lives2095* in the dynamic state memory zone, so we can point to it via the2096* 3DSTATE_SAMPLER_STATE_POINTERS_* commands.2097*/2098unsigned size = count * 4 * GENX(SAMPLER_STATE_length);2099uint32_t *map =2100upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);2101if (unlikely(!map))2102return;21032104struct pipe_resource *res = shs->sampler_table.res;2105struct iris_bo *bo = iris_resource_bo(res);21062107iris_record_state_size(ice->state.sizes,2108bo->gtt_offset + shs->sampler_table.offset, size);21092110shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);21112112/* Make sure all land in the same BO */2113iris_border_color_pool_reserve(ice, IRIS_MAX_TEXTURE_SAMPLERS);21142115ice->state.need_border_colors &= ~(1 << stage);21162117for (int i = 0; i < count; i++) {2118struct iris_sampler_state *state = shs->samplers[i];2119struct iris_sampler_view *tex = shs->textures[i];21202121if (!state) {2122memset(map, 0, 4 * GENX(SAMPLER_STATE_length));2123} else if (!state->needs_border_color) {2124memcpy(map, state->sampler_state, 4 * GENX(SAMPLER_STATE_length));2125} else {2126ice->state.need_border_colors |= 1 << stage;21272128/* We may need to swizzle the border color for format faking.2129* A/LA formats are faked as R/RG with 000R or R00G swizzles.2130* This means we need to move the border color's A channel into2131* the R or G channels so that those read swizzles will move it2132* back into A.2133*/2134union pipe_color_union *color = &state->border_color;2135union pipe_color_union tmp;2136if (tex) {2137enum pipe_format internal_format = tex->res->internal_format;21382139if (util_format_is_alpha(internal_format)) {2140unsigned char swz[4] = {2141PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,2142PIPE_SWIZZLE_0, PIPE_SWIZZLE_02143};2144util_format_apply_color_swizzle(&tmp, color, swz, true);2145color = &tmp;2146} else if (util_format_is_luminance_alpha(internal_format) &&2147internal_format != PIPE_FORMAT_L8A8_SRGB) {2148unsigned char swz[4] = {2149PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,2150PIPE_SWIZZLE_0, PIPE_SWIZZLE_02151};2152util_format_apply_color_swizzle(&tmp, color, swz, true);2153color = &tmp;2154}2155}21562157/* Stream out the border color and merge the pointer. */2158uint32_t offset = iris_upload_border_color(ice, color);21592160uint32_t dynamic[GENX(SAMPLER_STATE_length)];2161iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {2162dyns.BorderColorPointer = offset;2163}21642165for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)2166map[j] = state->sampler_state[j] | dynamic[j];2167}21682169map += GENX(SAMPLER_STATE_length);2170}2171}21722173static enum isl_channel_select2174fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)2175{2176switch (swz) {2177case PIPE_SWIZZLE_X: return fmt->swizzle.r;2178case PIPE_SWIZZLE_Y: return fmt->swizzle.g;2179case PIPE_SWIZZLE_Z: return fmt->swizzle.b;2180case PIPE_SWIZZLE_W: return fmt->swizzle.a;2181case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;2182case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;2183default: unreachable("invalid swizzle");2184}2185}21862187static void2188fill_buffer_surface_state(struct isl_device *isl_dev,2189struct iris_resource *res,2190void *map,2191enum isl_format format,2192struct isl_swizzle swizzle,2193unsigned offset,2194unsigned size,2195isl_surf_usage_flags_t usage)2196{2197const struct isl_format_layout *fmtl = isl_format_get_layout(format);2198const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;21992200/* The ARB_texture_buffer_specification says:2201*2202* "The number of texels in the buffer texture's texel array is given by2203*2204* floor(<buffer_size> / (<components> * sizeof(<base_type>)),2205*2206* where <buffer_size> is the size of the buffer object, in basic2207* machine units and <components> and <base_type> are the element count2208* and base data type for elements, as specified in Table X.1. The2209* number of texels in the texel array is then clamped to the2210* implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."2211*2212* We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,2213* so that when ISL divides by stride to obtain the number of texels, that2214* texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.2215*/2216unsigned final_size =2217MIN3(size, res->bo->size - res->offset - offset,2218IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);22192220isl_buffer_fill_state(isl_dev, map,2221.address = res->bo->gtt_offset + res->offset + offset,2222.size_B = final_size,2223.format = format,2224.swizzle = swizzle,2225.stride_B = cpp,2226.mocs = iris_mocs(res->bo, isl_dev, usage));2227}22282229#define SURFACE_STATE_ALIGNMENT 6422302231/**2232* Allocate several contiguous SURFACE_STATE structures, one for each2233* supported auxiliary surface mode. This only allocates the CPU-side2234* copy, they will need to be uploaded later after they're filled in.2235*/2236static void2237alloc_surface_states(struct iris_surface_state *surf_state,2238unsigned aux_usages)2239{2240const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);22412242/* If this changes, update this to explicitly align pointers */2243STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);22442245assert(aux_usages != 0);22462247/* In case we're re-allocating them... */2248free(surf_state->cpu);22492250surf_state->num_states = util_bitcount(aux_usages);2251surf_state->cpu = calloc(surf_state->num_states, surf_size);2252surf_state->ref.offset = 0;2253pipe_resource_reference(&surf_state->ref.res, NULL);22542255assert(surf_state->cpu);2256}22572258/**2259* Upload the CPU side SURFACE_STATEs into a GPU buffer.2260*/2261static void2262upload_surface_states(struct u_upload_mgr *mgr,2263struct iris_surface_state *surf_state)2264{2265const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);2266const unsigned bytes = surf_state->num_states * surf_size;22672268void *map =2269upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);22702271surf_state->ref.offset +=2272iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));22732274if (map)2275memcpy(map, surf_state->cpu, bytes);2276}22772278/**2279* Update resource addresses in a set of SURFACE_STATE descriptors,2280* and re-upload them if necessary.2281*/2282static bool2283update_surface_state_addrs(struct u_upload_mgr *mgr,2284struct iris_surface_state *surf_state,2285struct iris_bo *bo)2286{2287if (surf_state->bo_address == bo->gtt_offset)2288return false;22892290STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);2291STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);22922293uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];22942295/* First, update the CPU copies. We assume no other fields exist in2296* the QWord containing Surface Base Address.2297*/2298for (unsigned i = 0; i < surf_state->num_states; i++) {2299*ss_addr = *ss_addr - surf_state->bo_address + bo->gtt_offset;2300ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;2301}23022303/* Next, upload the updated copies to a GPU buffer. */2304upload_surface_states(mgr, surf_state);23052306surf_state->bo_address = bo->gtt_offset;23072308return true;2309}23102311static void2312fill_surface_state(struct isl_device *isl_dev,2313void *map,2314struct iris_resource *res,2315struct isl_surf *surf,2316struct isl_view *view,2317unsigned aux_usage,2318uint32_t extra_main_offset,2319uint32_t tile_x_sa,2320uint32_t tile_y_sa)2321{2322struct isl_surf_fill_state_info f = {2323.surf = surf,2324.view = view,2325.mocs = iris_mocs(res->bo, isl_dev, view->usage),2326.address = res->bo->gtt_offset + res->offset + extra_main_offset,2327.x_offset_sa = tile_x_sa,2328.y_offset_sa = tile_y_sa,2329};23302331assert(!iris_resource_unfinished_aux_import(res));23322333if (aux_usage != ISL_AUX_USAGE_NONE) {2334f.aux_surf = &res->aux.surf;2335f.aux_usage = aux_usage;2336f.aux_address = res->aux.bo->gtt_offset + res->aux.offset;23372338struct iris_bo *clear_bo = NULL;2339uint64_t clear_offset = 0;2340f.clear_color =2341iris_resource_get_clear_color(res, &clear_bo, &clear_offset);2342if (clear_bo) {2343f.clear_address = clear_bo->gtt_offset + clear_offset;2344f.use_clear_address = isl_dev->info->ver > 9;2345}2346}23472348isl_surf_fill_state_s(isl_dev, map, &f);2349}23502351/**2352* The pipe->create_sampler_view() driver hook.2353*/2354static struct pipe_sampler_view *2355iris_create_sampler_view(struct pipe_context *ctx,2356struct pipe_resource *tex,2357const struct pipe_sampler_view *tmpl)2358{2359struct iris_screen *screen = (struct iris_screen *)ctx->screen;2360const struct intel_device_info *devinfo = &screen->devinfo;2361struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));23622363if (!isv)2364return NULL;23652366/* initialize base object */2367isv->base = *tmpl;2368isv->base.context = ctx;2369isv->base.texture = NULL;2370pipe_reference_init(&isv->base.reference, 1);2371pipe_resource_reference(&isv->base.texture, tex);23722373if (util_format_is_depth_or_stencil(tmpl->format)) {2374struct iris_resource *zres, *sres;2375const struct util_format_description *desc =2376util_format_description(tmpl->format);23772378iris_get_depth_stencil_resources(tex, &zres, &sres);23792380tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;2381}23822383isv->res = (struct iris_resource *) tex;23842385alloc_surface_states(&isv->surface_state, isv->res->aux.sampler_usages);23862387isv->surface_state.bo_address = isv->res->bo->gtt_offset;23882389isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;23902391if (isv->base.target == PIPE_TEXTURE_CUBE ||2392isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)2393usage |= ISL_SURF_USAGE_CUBE_BIT;23942395const struct iris_format_info fmt =2396iris_format_for_usage(devinfo, tmpl->format, usage);23972398isv->clear_color = isv->res->aux.clear_color;23992400isv->view = (struct isl_view) {2401.format = fmt.fmt,2402.swizzle = (struct isl_swizzle) {2403.r = fmt_swizzle(&fmt, tmpl->swizzle_r),2404.g = fmt_swizzle(&fmt, tmpl->swizzle_g),2405.b = fmt_swizzle(&fmt, tmpl->swizzle_b),2406.a = fmt_swizzle(&fmt, tmpl->swizzle_a),2407},2408.usage = usage,2409};24102411void *map = isv->surface_state.cpu;24122413/* Fill out SURFACE_STATE for this view. */2414if (tmpl->target != PIPE_BUFFER) {2415isv->view.base_level = tmpl->u.tex.first_level;2416isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;2417// XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?2418isv->view.base_array_layer = tmpl->u.tex.first_layer;2419isv->view.array_len =2420tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;24212422if (iris_resource_unfinished_aux_import(isv->res))2423iris_resource_finish_aux_import(&screen->base, isv->res);24242425unsigned aux_modes = isv->res->aux.sampler_usages;2426while (aux_modes) {2427enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);24282429fill_surface_state(&screen->isl_dev, map, isv->res, &isv->res->surf,2430&isv->view, aux_usage, 0, 0, 0);24312432map += SURFACE_STATE_ALIGNMENT;2433}2434} else {2435fill_buffer_surface_state(&screen->isl_dev, isv->res, map,2436isv->view.format, isv->view.swizzle,2437tmpl->u.buf.offset, tmpl->u.buf.size,2438ISL_SURF_USAGE_TEXTURE_BIT);2439}24402441return &isv->base;2442}24432444static void2445iris_sampler_view_destroy(struct pipe_context *ctx,2446struct pipe_sampler_view *state)2447{2448struct iris_sampler_view *isv = (void *) state;2449pipe_resource_reference(&state->texture, NULL);2450pipe_resource_reference(&isv->surface_state.ref.res, NULL);2451free(isv->surface_state.cpu);2452free(isv);2453}24542455/**2456* The pipe->create_surface() driver hook.2457*2458* In Gallium nomenclature, "surfaces" are a view of a resource that2459* can be bound as a render target or depth/stencil buffer.2460*/2461static struct pipe_surface *2462iris_create_surface(struct pipe_context *ctx,2463struct pipe_resource *tex,2464const struct pipe_surface *tmpl)2465{2466struct iris_screen *screen = (struct iris_screen *)ctx->screen;2467const struct intel_device_info *devinfo = &screen->devinfo;24682469isl_surf_usage_flags_t usage = 0;2470if (tmpl->writable)2471usage = ISL_SURF_USAGE_STORAGE_BIT;2472else if (util_format_is_depth_or_stencil(tmpl->format))2473usage = ISL_SURF_USAGE_DEPTH_BIT;2474else2475usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;24762477const struct iris_format_info fmt =2478iris_format_for_usage(devinfo, tmpl->format, usage);24792480if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&2481!isl_format_supports_rendering(devinfo, fmt.fmt)) {2482/* Framebuffer validation will reject this invalid case, but it2483* hasn't had the opportunity yet. In the meantime, we need to2484* avoid hitting ISL asserts about unsupported formats below.2485*/2486return NULL;2487}24882489struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));2490struct pipe_surface *psurf = &surf->base;2491struct iris_resource *res = (struct iris_resource *) tex;24922493if (!surf)2494return NULL;24952496pipe_reference_init(&psurf->reference, 1);2497pipe_resource_reference(&psurf->texture, tex);2498psurf->context = ctx;2499psurf->format = tmpl->format;2500psurf->width = tex->width0;2501psurf->height = tex->height0;2502psurf->texture = tex;2503psurf->u.tex.first_layer = tmpl->u.tex.first_layer;2504psurf->u.tex.last_layer = tmpl->u.tex.last_layer;2505psurf->u.tex.level = tmpl->u.tex.level;25062507uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;25082509struct isl_view *view = &surf->view;2510*view = (struct isl_view) {2511.format = fmt.fmt,2512.base_level = tmpl->u.tex.level,2513.levels = 1,2514.base_array_layer = tmpl->u.tex.first_layer,2515.array_len = array_len,2516.swizzle = ISL_SWIZZLE_IDENTITY,2517.usage = usage,2518};25192520#if GFX_VER == 82521struct isl_view *read_view = &surf->read_view;2522*read_view = (struct isl_view) {2523.format = fmt.fmt,2524.base_level = tmpl->u.tex.level,2525.levels = 1,2526.base_array_layer = tmpl->u.tex.first_layer,2527.array_len = array_len,2528.swizzle = ISL_SWIZZLE_IDENTITY,2529.usage = ISL_SURF_USAGE_TEXTURE_BIT,2530};25312532struct isl_surf read_surf = res->surf;2533uint32_t read_surf_offset_B = 0;2534uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;2535if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {2536/* The minimum array element field of the surface state structure is2537* ignored by the sampler unit for 3D textures on some hardware. If the2538* render buffer is a single slice of a 3D texture, create a 2D texture2539* covering that slice.2540*2541* TODO: This only handles the case where we're rendering to a single2542* slice of an array texture. If we have layered rendering combined2543* with non-coherent FB fetch and a non-zero base_array_layer, then2544* we're going to run into problems.2545*2546* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/49042547*/2548isl_surf_get_image_surf(&screen->isl_dev, &res->surf,2549read_view->base_level,25500, read_view->base_array_layer,2551&read_surf, &read_surf_offset_B,2552&read_surf_tile_x_sa, &read_surf_tile_y_sa);2553read_view->base_level = 0;2554read_view->base_array_layer = 0;2555assert(read_view->array_len == 1);2556} else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {2557/* Convert 1D array textures to 2D arrays because shaders always provide2558* the array index coordinate at the Z component to avoid recompiles2559* when changing the texture target of the framebuffer.2560*/2561assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);2562read_surf.dim = ISL_SURF_DIM_2D;2563}2564#endif25652566surf->clear_color = res->aux.clear_color;25672568/* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */2569if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |2570ISL_SURF_USAGE_STENCIL_BIT))2571return psurf;257225732574alloc_surface_states(&surf->surface_state, res->aux.possible_usages);2575surf->surface_state.bo_address = res->bo->gtt_offset;25762577#if GFX_VER == 82578alloc_surface_states(&surf->surface_state_read, res->aux.possible_usages);2579surf->surface_state_read.bo_address = res->bo->gtt_offset;2580#endif25812582if (!isl_format_is_compressed(res->surf.format)) {2583if (iris_resource_unfinished_aux_import(res))2584iris_resource_finish_aux_import(&screen->base, res);25852586void *map = surf->surface_state.cpu;2587UNUSED void *map_read = surf->surface_state_read.cpu;25882589/* This is a normal surface. Fill out a SURFACE_STATE for each possible2590* auxiliary surface mode and return the pipe_surface.2591*/2592unsigned aux_modes = res->aux.possible_usages;2593while (aux_modes) {2594enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);2595fill_surface_state(&screen->isl_dev, map, res, &res->surf,2596view, aux_usage, 0, 0, 0);2597map += SURFACE_STATE_ALIGNMENT;25982599#if GFX_VER == 82600fill_surface_state(&screen->isl_dev, map_read, res,2601&read_surf, read_view, aux_usage,2602read_surf_offset_B,2603read_surf_tile_x_sa, read_surf_tile_y_sa);2604map_read += SURFACE_STATE_ALIGNMENT;2605#endif2606}26072608return psurf;2609}26102611/* The resource has a compressed format, which is not renderable, but we2612* have a renderable view format. We must be attempting to upload blocks2613* of compressed data via an uncompressed view.2614*2615* In this case, we can assume there are no auxiliary buffers, a single2616* miplevel, and that the resource is single-sampled. Gallium may try2617* and create an uncompressed view with multiple layers, however.2618*/2619assert(!isl_format_is_compressed(fmt.fmt));2620assert(res->aux.possible_usages == 1 << ISL_AUX_USAGE_NONE);2621assert(res->surf.samples == 1);2622assert(view->levels == 1);26232624struct isl_surf isl_surf;2625uint32_t offset_B = 0, tile_x_el = 0, tile_y_el = 0;2626bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev, &res->surf,2627view, &isl_surf, view,2628&offset_B, &tile_x_el, &tile_y_el);2629if (!ok) {2630free(surf);2631return NULL;2632}26332634psurf->width = isl_surf.logical_level0_px.width;2635psurf->height = isl_surf.logical_level0_px.height;26362637struct isl_surf_fill_state_info f = {2638.surf = &isl_surf,2639.view = view,2640.mocs = iris_mocs(res->bo, &screen->isl_dev,2641ISL_SURF_USAGE_RENDER_TARGET_BIT),2642.address = res->bo->gtt_offset + offset_B,2643.x_offset_sa = tile_x_el, /* Single-sampled, so el == sa */2644.y_offset_sa = tile_y_el, /* Single-sampled, so el == sa */2645};26462647isl_surf_fill_state_s(&screen->isl_dev, surf->surface_state.cpu, &f);26482649return psurf;2650}26512652#if GFX_VER < 92653static void2654fill_default_image_param(struct brw_image_param *param)2655{2656memset(param, 0, sizeof(*param));2657/* Set the swizzling shifts to all-ones to effectively disable swizzling --2658* See emit_address_calculation() in brw_fs_surface_builder.cpp for a more2659* detailed explanation of these parameters.2660*/2661param->swizzling[0] = 0xff;2662param->swizzling[1] = 0xff;2663}26642665static void2666fill_buffer_image_param(struct brw_image_param *param,2667enum pipe_format pfmt,2668unsigned size)2669{2670const unsigned cpp = util_format_get_blocksize(pfmt);26712672fill_default_image_param(param);2673param->size[0] = size / cpp;2674param->stride[0] = cpp;2675}2676#else2677#define isl_surf_fill_image_param(x, ...)2678#define fill_default_image_param(x, ...)2679#define fill_buffer_image_param(x, ...)2680#endif26812682/**2683* The pipe->set_shader_images() driver hook.2684*/2685static void2686iris_set_shader_images(struct pipe_context *ctx,2687enum pipe_shader_type p_stage,2688unsigned start_slot, unsigned count,2689unsigned unbind_num_trailing_slots,2690const struct pipe_image_view *p_images)2691{2692struct iris_context *ice = (struct iris_context *) ctx;2693struct iris_screen *screen = (struct iris_screen *)ctx->screen;2694gl_shader_stage stage = stage_from_pipe(p_stage);2695struct iris_shader_state *shs = &ice->state.shaders[stage];2696#if GFX_VER == 82697struct iris_genx_state *genx = ice->state.genx;2698struct brw_image_param *image_params = genx->shaders[stage].image_param;2699#endif27002701shs->bound_image_views &=2702~u_bit_consecutive(start_slot, count + unbind_num_trailing_slots);27032704for (unsigned i = 0; i < count; i++) {2705struct iris_image_view *iv = &shs->image[start_slot + i];27062707if (p_images && p_images[i].resource) {2708const struct pipe_image_view *img = &p_images[i];2709struct iris_resource *res = (void *) img->resource;27102711util_copy_image_view(&iv->base, img);27122713shs->bound_image_views |= 1 << (start_slot + i);27142715res->bind_history |= PIPE_BIND_SHADER_IMAGE;2716res->bind_stages |= 1 << stage;27172718enum isl_format isl_fmt = iris_image_view_get_format(ice, img);27192720/* Render compression with images supported on gfx12+ only. */2721unsigned aux_usages = GFX_VER >= 12 ? res->aux.possible_usages :27221 << ISL_AUX_USAGE_NONE;27232724alloc_surface_states(&iv->surface_state, aux_usages);2725iv->surface_state.bo_address = res->bo->gtt_offset;27262727void *map = iv->surface_state.cpu;27282729if (res->base.b.target != PIPE_BUFFER) {2730struct isl_view view = {2731.format = isl_fmt,2732.base_level = img->u.tex.level,2733.levels = 1,2734.base_array_layer = img->u.tex.first_layer,2735.array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,2736.swizzle = ISL_SWIZZLE_IDENTITY,2737.usage = ISL_SURF_USAGE_STORAGE_BIT,2738};27392740/* If using untyped fallback. */2741if (isl_fmt == ISL_FORMAT_RAW) {2742fill_buffer_surface_state(&screen->isl_dev, res, map,2743isl_fmt, ISL_SWIZZLE_IDENTITY,27440, res->bo->size,2745ISL_SURF_USAGE_STORAGE_BIT);2746} else {2747unsigned aux_modes = aux_usages;2748while (aux_modes) {2749enum isl_aux_usage usage = u_bit_scan(&aux_modes);27502751fill_surface_state(&screen->isl_dev, map, res, &res->surf,2752&view, usage, 0, 0, 0);27532754map += SURFACE_STATE_ALIGNMENT;2755}2756}27572758isl_surf_fill_image_param(&screen->isl_dev,2759&image_params[start_slot + i],2760&res->surf, &view);2761} else {2762util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,2763img->u.buf.offset + img->u.buf.size);27642765fill_buffer_surface_state(&screen->isl_dev, res, map,2766isl_fmt, ISL_SWIZZLE_IDENTITY,2767img->u.buf.offset, img->u.buf.size,2768ISL_SURF_USAGE_STORAGE_BIT);2769fill_buffer_image_param(&image_params[start_slot + i],2770img->format, img->u.buf.size);2771}27722773upload_surface_states(ice->state.surface_uploader, &iv->surface_state);2774} else {2775pipe_resource_reference(&iv->base.resource, NULL);2776pipe_resource_reference(&iv->surface_state.ref.res, NULL);2777fill_default_image_param(&image_params[start_slot + i]);2778}2779}27802781ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;2782ice->state.dirty |=2783stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES2784: IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;27852786/* Broadwell also needs brw_image_params re-uploaded */2787if (GFX_VER < 9) {2788ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;2789shs->sysvals_need_upload = true;2790}27912792if (unbind_num_trailing_slots) {2793iris_set_shader_images(ctx, p_stage, start_slot + count,2794unbind_num_trailing_slots, 0, NULL);2795}2796}279727982799/**2800* The pipe->set_sampler_views() driver hook.2801*/2802static void2803iris_set_sampler_views(struct pipe_context *ctx,2804enum pipe_shader_type p_stage,2805unsigned start, unsigned count,2806unsigned unbind_num_trailing_slots,2807struct pipe_sampler_view **views)2808{2809struct iris_context *ice = (struct iris_context *) ctx;2810gl_shader_stage stage = stage_from_pipe(p_stage);2811struct iris_shader_state *shs = &ice->state.shaders[stage];2812unsigned i;28132814shs->bound_sampler_views &=2815~u_bit_consecutive(start, count + unbind_num_trailing_slots);28162817for (i = 0; i < count; i++) {2818struct pipe_sampler_view *pview = views ? views[i] : NULL;2819pipe_sampler_view_reference((struct pipe_sampler_view **)2820&shs->textures[start + i], pview);2821struct iris_sampler_view *view = (void *) pview;2822if (view) {2823view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;2824view->res->bind_stages |= 1 << stage;28252826shs->bound_sampler_views |= 1 << (start + i);28272828update_surface_state_addrs(ice->state.surface_uploader,2829&view->surface_state, view->res->bo);2830}2831}2832for (; i < count + unbind_num_trailing_slots; i++) {2833pipe_sampler_view_reference((struct pipe_sampler_view **)2834&shs->textures[start + i], NULL);2835}28362837ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);2838ice->state.dirty |=2839stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES2840: IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;2841}28422843static void2844iris_set_compute_resources(struct pipe_context *ctx,2845unsigned start, unsigned count,2846struct pipe_surface **resources)2847{2848assert(count == 0);2849}28502851static void2852iris_set_global_binding(struct pipe_context *ctx,2853unsigned start_slot, unsigned count,2854struct pipe_resource **resources,2855uint32_t **handles)2856{2857struct iris_context *ice = (struct iris_context *) ctx;28582859assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);2860for (unsigned i = 0; i < count; i++) {2861if (resources && resources[i]) {2862pipe_resource_reference(&ice->state.global_bindings[start_slot + i],2863resources[i]);2864struct iris_resource *res = (void *) resources[i];2865uint64_t addr = res->bo->gtt_offset;2866memcpy(handles[i], &addr, sizeof(addr));2867} else {2868pipe_resource_reference(&ice->state.global_bindings[start_slot + i],2869NULL);2870}2871}28722873ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;2874}28752876/**2877* The pipe->set_tess_state() driver hook.2878*/2879static void2880iris_set_tess_state(struct pipe_context *ctx,2881const float default_outer_level[4],2882const float default_inner_level[2])2883{2884struct iris_context *ice = (struct iris_context *) ctx;2885struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];28862887memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));2888memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));28892890ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;2891shs->sysvals_need_upload = true;2892}28932894static void2895iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)2896{2897struct iris_surface *surf = (void *) p_surf;2898pipe_resource_reference(&p_surf->texture, NULL);2899pipe_resource_reference(&surf->surface_state.ref.res, NULL);2900pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);2901free(surf->surface_state.cpu);2902free(surf);2903}29042905static void2906iris_set_clip_state(struct pipe_context *ctx,2907const struct pipe_clip_state *state)2908{2909struct iris_context *ice = (struct iris_context *) ctx;2910struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];2911struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];2912struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];29132914memcpy(&ice->state.clip_planes, state, sizeof(*state));29152916ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |2917IRIS_STAGE_DIRTY_CONSTANTS_GS |2918IRIS_STAGE_DIRTY_CONSTANTS_TES;2919shs->sysvals_need_upload = true;2920gshs->sysvals_need_upload = true;2921tshs->sysvals_need_upload = true;2922}29232924/**2925* The pipe->set_polygon_stipple() driver hook.2926*/2927static void2928iris_set_polygon_stipple(struct pipe_context *ctx,2929const struct pipe_poly_stipple *state)2930{2931struct iris_context *ice = (struct iris_context *) ctx;2932memcpy(&ice->state.poly_stipple, state, sizeof(*state));2933ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;2934}29352936/**2937* The pipe->set_sample_mask() driver hook.2938*/2939static void2940iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)2941{2942struct iris_context *ice = (struct iris_context *) ctx;29432944/* We only support 16x MSAA, so we have 16 bits of sample maks.2945* st/mesa may pass us 0xffffffff though, meaning "enable all samples".2946*/2947ice->state.sample_mask = sample_mask & 0xffff;2948ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;2949}29502951/**2952* The pipe->set_scissor_states() driver hook.2953*2954* This corresponds to our SCISSOR_RECT state structures. It's an2955* exact match, so we just store them, and memcpy them out later.2956*/2957static void2958iris_set_scissor_states(struct pipe_context *ctx,2959unsigned start_slot,2960unsigned num_scissors,2961const struct pipe_scissor_state *rects)2962{2963struct iris_context *ice = (struct iris_context *) ctx;29642965for (unsigned i = 0; i < num_scissors; i++) {2966if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {2967/* If the scissor was out of bounds and got clamped to 0 width/height2968* at the bounds, the subtraction of 1 from maximums could produce a2969* negative number and thus not clip anything. Instead, just provide2970* a min > max scissor inside the bounds, which produces the expected2971* no rendering.2972*/2973ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {2974.minx = 1, .maxx = 0, .miny = 1, .maxy = 0,2975};2976} else {2977ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {2978.minx = rects[i].minx, .miny = rects[i].miny,2979.maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,2980};2981}2982}29832984ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;2985}29862987/**2988* The pipe->set_stencil_ref() driver hook.2989*2990* This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.2991*/2992static void2993iris_set_stencil_ref(struct pipe_context *ctx,2994const struct pipe_stencil_ref state)2995{2996struct iris_context *ice = (struct iris_context *) ctx;2997memcpy(&ice->state.stencil_ref, &state, sizeof(state));2998if (GFX_VER >= 12)2999ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;3000else if (GFX_VER >= 9)3001ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;3002else3003ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;3004}30053006static float3007viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)3008{3009return copysignf(state->scale[axis], sign) + state->translate[axis];3010}30113012/**3013* The pipe->set_viewport_states() driver hook.3014*3015* This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate3016* the guardband yet, as we need the framebuffer dimensions, but we can3017* at least fill out the rest.3018*/3019static void3020iris_set_viewport_states(struct pipe_context *ctx,3021unsigned start_slot,3022unsigned count,3023const struct pipe_viewport_state *states)3024{3025struct iris_context *ice = (struct iris_context *) ctx;30263027memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);30283029ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;30303031if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||3032!ice->state.cso_rast->depth_clip_far))3033ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;3034}30353036/**3037* The pipe->set_framebuffer_state() driver hook.3038*3039* Sets the current draw FBO, including color render targets, depth,3040* and stencil buffers.3041*/3042static void3043iris_set_framebuffer_state(struct pipe_context *ctx,3044const struct pipe_framebuffer_state *state)3045{3046struct iris_context *ice = (struct iris_context *) ctx;3047struct iris_screen *screen = (struct iris_screen *)ctx->screen;3048struct isl_device *isl_dev = &screen->isl_dev;3049struct pipe_framebuffer_state *cso = &ice->state.framebuffer;3050struct iris_resource *zres;3051struct iris_resource *stencil_res;30523053unsigned samples = util_framebuffer_get_num_samples(state);3054unsigned layers = util_framebuffer_get_num_layers(state);30553056if (cso->samples != samples) {3057ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;30583059/* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */3060if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))3061ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;3062}30633064if (cso->nr_cbufs != state->nr_cbufs) {3065ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;3066}30673068if ((cso->layers == 0) != (layers == 0)) {3069ice->state.dirty |= IRIS_DIRTY_CLIP;3070}30713072if (cso->width != state->width || cso->height != state->height) {3073ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;3074}30753076if (cso->zsbuf || state->zsbuf) {3077ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;3078}30793080util_copy_framebuffer_state(cso, state);3081cso->samples = samples;3082cso->layers = layers;30833084struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;30853086struct isl_view view = {3087.base_level = 0,3088.levels = 1,3089.base_array_layer = 0,3090.array_len = 1,3091.swizzle = ISL_SWIZZLE_IDENTITY,3092};30933094struct isl_depth_stencil_hiz_emit_info info = { .view = &view };30953096if (cso->zsbuf) {3097iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,3098&stencil_res);30993100view.base_level = cso->zsbuf->u.tex.level;3101view.base_array_layer = cso->zsbuf->u.tex.first_layer;3102view.array_len =3103cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;31043105if (zres) {3106view.usage |= ISL_SURF_USAGE_DEPTH_BIT;31073108info.depth_surf = &zres->surf;3109info.depth_address = zres->bo->gtt_offset + zres->offset;3110info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);31113112view.format = zres->surf.format;31133114if (iris_resource_level_has_hiz(zres, view.base_level)) {3115info.hiz_usage = zres->aux.usage;3116info.hiz_surf = &zres->aux.surf;3117info.hiz_address = zres->aux.bo->gtt_offset + zres->aux.offset;3118}31193120ice->state.hiz_usage = info.hiz_usage;3121}31223123if (stencil_res) {3124view.usage |= ISL_SURF_USAGE_STENCIL_BIT;3125info.stencil_aux_usage = stencil_res->aux.usage;3126info.stencil_surf = &stencil_res->surf;3127info.stencil_address = stencil_res->bo->gtt_offset + stencil_res->offset;3128if (!zres) {3129view.format = stencil_res->surf.format;3130info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);3131}3132}3133}31343135isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);31363137/* Make a null surface for unbound buffers */3138void *null_surf_map =3139upload_state(ice->state.surface_uploader, &ice->state.null_fb,31404 * GENX(RENDER_SURFACE_STATE_length), 64);3141isl_null_fill_state(&screen->isl_dev, null_surf_map,3142.size = isl_extent3d(MAX2(cso->width, 1),3143MAX2(cso->height, 1),3144cso->layers ? cso->layers : 1));3145ice->state.null_fb.offset +=3146iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));31473148/* Render target change */3149ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;31503151ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;31523153ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;31543155ice->state.stage_dirty |=3156ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];31573158if (GFX_VER == 8)3159ice->state.dirty |= IRIS_DIRTY_PMA_FIX;3160}31613162/**3163* The pipe->set_constant_buffer() driver hook.3164*3165* This uploads any constant data in user buffers, and references3166* any UBO resources containing constant data.3167*/3168static void3169iris_set_constant_buffer(struct pipe_context *ctx,3170enum pipe_shader_type p_stage, unsigned index,3171bool take_ownership,3172const struct pipe_constant_buffer *input)3173{3174struct iris_context *ice = (struct iris_context *) ctx;3175gl_shader_stage stage = stage_from_pipe(p_stage);3176struct iris_shader_state *shs = &ice->state.shaders[stage];3177struct pipe_shader_buffer *cbuf = &shs->constbuf[index];31783179/* TODO: Only do this if the buffer changes? */3180pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);31813182if (input && input->buffer_size && (input->buffer || input->user_buffer)) {3183shs->bound_cbufs |= 1u << index;31843185if (input->user_buffer) {3186void *map = NULL;3187pipe_resource_reference(&cbuf->buffer, NULL);3188u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,3189&cbuf->buffer_offset, &cbuf->buffer, (void **) &map);31903191if (!cbuf->buffer) {3192/* Allocation was unsuccessful - just unbind */3193iris_set_constant_buffer(ctx, p_stage, index, false, NULL);3194return;3195}31963197assert(map);3198memcpy(map, input->user_buffer, input->buffer_size);3199} else if (input->buffer) {3200if (take_ownership) {3201pipe_resource_reference(&cbuf->buffer, NULL);3202cbuf->buffer = input->buffer;3203} else {3204pipe_resource_reference(&cbuf->buffer, input->buffer);3205}32063207cbuf->buffer_offset = input->buffer_offset;3208}32093210cbuf->buffer_size =3211MIN2(input->buffer_size,3212iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);32133214struct iris_resource *res = (void *) cbuf->buffer;3215res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;3216res->bind_stages |= 1 << stage;3217} else {3218shs->bound_cbufs &= ~(1u << index);3219pipe_resource_reference(&cbuf->buffer, NULL);3220}32213222ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;3223}32243225static void3226upload_sysvals(struct iris_context *ice,3227gl_shader_stage stage,3228const struct pipe_grid_info *grid)3229{3230UNUSED struct iris_genx_state *genx = ice->state.genx;3231struct iris_shader_state *shs = &ice->state.shaders[stage];32323233struct iris_compiled_shader *shader = ice->shaders.prog[stage];3234if (!shader || (shader->num_system_values == 0 &&3235shader->kernel_input_size == 0))3236return;32373238assert(shader->num_cbufs > 0);32393240unsigned sysval_cbuf_index = shader->num_cbufs - 1;3241struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];3242unsigned system_values_start =3243ALIGN(shader->kernel_input_size, sizeof(uint32_t));3244unsigned upload_size = system_values_start +3245shader->num_system_values * sizeof(uint32_t);3246void *map = NULL;32473248assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);3249u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,3250&cbuf->buffer_offset, &cbuf->buffer, &map);32513252if (shader->kernel_input_size > 0)3253memcpy(map, grid->input, shader->kernel_input_size);32543255uint32_t *sysval_map = map + system_values_start;3256for (int i = 0; i < shader->num_system_values; i++) {3257uint32_t sysval = shader->system_values[i];3258uint32_t value = 0;32593260if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {3261#if GFX_VER == 83262unsigned img = BRW_PARAM_IMAGE_IDX(sysval);3263unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);3264struct brw_image_param *param =3265&genx->shaders[stage].image_param[img];32663267assert(offset < sizeof(struct brw_image_param));3268value = ((uint32_t *) param)[offset];3269#endif3270} else if (sysval == BRW_PARAM_BUILTIN_ZERO) {3271value = 0;3272} else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {3273int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);3274int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);3275value = fui(ice->state.clip_planes.ucp[plane][comp]);3276} else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {3277if (stage == MESA_SHADER_TESS_CTRL) {3278value = ice->state.vertices_per_patch;3279} else {3280assert(stage == MESA_SHADER_TESS_EVAL);3281const struct shader_info *tcs_info =3282iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);3283if (tcs_info)3284value = tcs_info->tess.tcs_vertices_out;3285else3286value = ice->state.vertices_per_patch;3287}3288} else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&3289sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {3290unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;3291value = fui(ice->state.default_outer_level[i]);3292} else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {3293value = fui(ice->state.default_inner_level[0]);3294} else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {3295value = fui(ice->state.default_inner_level[1]);3296} else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&3297sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {3298unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;3299value = ice->state.last_block[i];3300} else if (sysval == BRW_PARAM_BUILTIN_WORK_DIM) {3301value = grid->work_dim;3302} else {3303assert(!"unhandled system value");3304}33053306*sysval_map++ = value;3307}33083309cbuf->buffer_size = upload_size;3310iris_upload_ubo_ssbo_surf_state(ice, cbuf,3311&shs->constbuf_surf_state[sysval_cbuf_index],3312ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);33133314shs->sysvals_need_upload = false;3315}33163317/**3318* The pipe->set_shader_buffers() driver hook.3319*3320* This binds SSBOs and ABOs. Unfortunately, we need to stream out3321* SURFACE_STATE here, as the buffer offset may change each time.3322*/3323static void3324iris_set_shader_buffers(struct pipe_context *ctx,3325enum pipe_shader_type p_stage,3326unsigned start_slot, unsigned count,3327const struct pipe_shader_buffer *buffers,3328unsigned writable_bitmask)3329{3330struct iris_context *ice = (struct iris_context *) ctx;3331gl_shader_stage stage = stage_from_pipe(p_stage);3332struct iris_shader_state *shs = &ice->state.shaders[stage];33333334unsigned modified_bits = u_bit_consecutive(start_slot, count);33353336shs->bound_ssbos &= ~modified_bits;3337shs->writable_ssbos &= ~modified_bits;3338shs->writable_ssbos |= writable_bitmask << start_slot;33393340for (unsigned i = 0; i < count; i++) {3341if (buffers && buffers[i].buffer) {3342struct iris_resource *res = (void *) buffers[i].buffer;3343struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];3344struct iris_state_ref *surf_state =3345&shs->ssbo_surf_state[start_slot + i];3346pipe_resource_reference(&ssbo->buffer, &res->base.b);3347ssbo->buffer_offset = buffers[i].buffer_offset;3348ssbo->buffer_size =3349MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);33503351shs->bound_ssbos |= 1 << (start_slot + i);33523353isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;33543355iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);33563357res->bind_history |= PIPE_BIND_SHADER_BUFFER;3358res->bind_stages |= 1 << stage;33593360util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,3361ssbo->buffer_offset + ssbo->buffer_size);3362} else {3363pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);3364pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,3365NULL);3366}3367}33683369ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;3370}33713372static void3373iris_delete_state(struct pipe_context *ctx, void *state)3374{3375free(state);3376}33773378/**3379* The pipe->set_vertex_buffers() driver hook.3380*3381* This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.3382*/3383static void3384iris_set_vertex_buffers(struct pipe_context *ctx,3385unsigned start_slot, unsigned count,3386unsigned unbind_num_trailing_slots,3387bool take_ownership,3388const struct pipe_vertex_buffer *buffers)3389{3390struct iris_context *ice = (struct iris_context *) ctx;3391struct iris_screen *screen = (struct iris_screen *)ctx->screen;3392struct iris_genx_state *genx = ice->state.genx;33933394ice->state.bound_vertex_buffers &=3395~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);33963397for (unsigned i = 0; i < count; i++) {3398const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;3399struct iris_vertex_buffer_state *state =3400&genx->vertex_buffers[start_slot + i];34013402if (!buffer) {3403pipe_resource_reference(&state->resource, NULL);3404continue;3405}34063407/* We may see user buffers that are NULL bindings. */3408assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));34093410if (take_ownership) {3411pipe_resource_reference(&state->resource, NULL);3412state->resource = buffer->buffer.resource;3413} else {3414pipe_resource_reference(&state->resource, buffer->buffer.resource);3415}3416struct iris_resource *res = (void *) state->resource;34173418state->offset = (int) buffer->buffer_offset;34193420if (res) {3421ice->state.bound_vertex_buffers |= 1ull << (start_slot + i);3422res->bind_history |= PIPE_BIND_VERTEX_BUFFER;3423}34243425iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {3426vb.VertexBufferIndex = start_slot + i;3427vb.AddressModifyEnable = true;3428vb.BufferPitch = buffer->stride;3429if (res) {3430vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;3431vb.BufferStartingAddress =3432ro_bo(NULL, res->bo->gtt_offset + (int) buffer->buffer_offset);3433vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,3434ISL_SURF_USAGE_VERTEX_BUFFER_BIT);3435#if GFX_VER >= 123436vb.L3BypassDisable = true;3437#endif3438} else {3439vb.NullVertexBuffer = true;3440}3441}3442}34433444for (unsigned i = 0; i < unbind_num_trailing_slots; i++) {3445struct iris_vertex_buffer_state *state =3446&genx->vertex_buffers[start_slot + count + i];34473448pipe_resource_reference(&state->resource, NULL);3449}34503451ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;3452}34533454/**3455* Gallium CSO for vertex elements.3456*/3457struct iris_vertex_element_state {3458uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];3459uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];3460uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];3461uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];3462unsigned count;3463};34643465/**3466* The pipe->create_vertex_elements() driver hook.3467*3468* This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS3469* and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing3470* arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are3471* needed. In these cases we will need information available at draw time.3472* We setup edgeflag_ve and edgeflag_vfi as alternatives last3473* 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at3474* draw time if we detect that EdgeFlag is needed by the Vertex Shader.3475*/3476static void *3477iris_create_vertex_elements(struct pipe_context *ctx,3478unsigned count,3479const struct pipe_vertex_element *state)3480{3481struct iris_screen *screen = (struct iris_screen *)ctx->screen;3482const struct intel_device_info *devinfo = &screen->devinfo;3483struct iris_vertex_element_state *cso =3484malloc(sizeof(struct iris_vertex_element_state));34853486cso->count = count;34873488iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {3489ve.DWordLength =34901 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;3491}34923493uint32_t *ve_pack_dest = &cso->vertex_elements[1];3494uint32_t *vfi_pack_dest = cso->vf_instancing;34953496if (count == 0) {3497iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {3498ve.Valid = true;3499ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;3500ve.Component0Control = VFCOMP_STORE_0;3501ve.Component1Control = VFCOMP_STORE_0;3502ve.Component2Control = VFCOMP_STORE_0;3503ve.Component3Control = VFCOMP_STORE_1_FP;3504}35053506iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {3507}3508}35093510for (int i = 0; i < count; i++) {3511const struct iris_format_info fmt =3512iris_format_for_usage(devinfo, state[i].src_format, 0);3513unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,3514VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };35153516switch (isl_format_get_num_channels(fmt.fmt)) {3517case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;3518case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;3519case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;3520case 3:3521comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT3522: VFCOMP_STORE_1_FP;3523break;3524}3525iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {3526ve.EdgeFlagEnable = false;3527ve.VertexBufferIndex = state[i].vertex_buffer_index;3528ve.Valid = true;3529ve.SourceElementOffset = state[i].src_offset;3530ve.SourceElementFormat = fmt.fmt;3531ve.Component0Control = comp[0];3532ve.Component1Control = comp[1];3533ve.Component2Control = comp[2];3534ve.Component3Control = comp[3];3535}35363537iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {3538vi.VertexElementIndex = i;3539vi.InstancingEnable = state[i].instance_divisor > 0;3540vi.InstanceDataStepRate = state[i].instance_divisor;3541}35423543ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);3544vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);3545}35463547/* An alternative version of the last VE and VFI is stored so it3548* can be used at draw time in case Vertex Shader uses EdgeFlag3549*/3550if (count) {3551const unsigned edgeflag_index = count - 1;3552const struct iris_format_info fmt =3553iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);3554iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {3555ve.EdgeFlagEnable = true ;3556ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;3557ve.Valid = true;3558ve.SourceElementOffset = state[edgeflag_index].src_offset;3559ve.SourceElementFormat = fmt.fmt;3560ve.Component0Control = VFCOMP_STORE_SRC;3561ve.Component1Control = VFCOMP_STORE_0;3562ve.Component2Control = VFCOMP_STORE_0;3563ve.Component3Control = VFCOMP_STORE_0;3564}3565iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {3566/* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled3567* at draw time, as it should change if SGVs are emitted.3568*/3569vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;3570vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;3571}3572}35733574return cso;3575}35763577/**3578* The pipe->bind_vertex_elements_state() driver hook.3579*/3580static void3581iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)3582{3583struct iris_context *ice = (struct iris_context *) ctx;3584struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;3585struct iris_vertex_element_state *new_cso = state;35863587/* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,3588* we need to re-emit it to ensure we're overriding the right one.3589*/3590if (new_cso && cso_changed(count))3591ice->state.dirty |= IRIS_DIRTY_VF_SGVS;35923593ice->state.cso_vertex_elements = state;3594ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;3595}35963597/**3598* The pipe->create_stream_output_target() driver hook.3599*3600* "Target" here refers to a destination buffer. We translate this into3601* a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet3602* know which buffer this represents, or whether we ought to zero the3603* write-offsets, or append. Those are handled in the set() hook.3604*/3605static struct pipe_stream_output_target *3606iris_create_stream_output_target(struct pipe_context *ctx,3607struct pipe_resource *p_res,3608unsigned buffer_offset,3609unsigned buffer_size)3610{3611struct iris_resource *res = (void *) p_res;3612struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));3613if (!cso)3614return NULL;36153616res->bind_history |= PIPE_BIND_STREAM_OUTPUT;36173618pipe_reference_init(&cso->base.reference, 1);3619pipe_resource_reference(&cso->base.buffer, p_res);3620cso->base.buffer_offset = buffer_offset;3621cso->base.buffer_size = buffer_size;3622cso->base.context = ctx;36233624util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,3625buffer_offset + buffer_size);36263627return &cso->base;3628}36293630static void3631iris_stream_output_target_destroy(struct pipe_context *ctx,3632struct pipe_stream_output_target *state)3633{3634struct iris_stream_output_target *cso = (void *) state;36353636pipe_resource_reference(&cso->base.buffer, NULL);3637pipe_resource_reference(&cso->offset.res, NULL);36383639free(cso);3640}36413642/**3643* The pipe->set_stream_output_targets() driver hook.3644*3645* At this point, we know which targets are bound to a particular index,3646* and also whether we want to append or start over. We can finish the3647* 3DSTATE_SO_BUFFER packets we started earlier.3648*/3649static void3650iris_set_stream_output_targets(struct pipe_context *ctx,3651unsigned num_targets,3652struct pipe_stream_output_target **targets,3653const unsigned *offsets)3654{3655struct iris_context *ice = (struct iris_context *) ctx;3656struct iris_genx_state *genx = ice->state.genx;3657uint32_t *so_buffers = genx->so_buffers;3658struct iris_screen *screen = (struct iris_screen *)ctx->screen;36593660const bool active = num_targets > 0;3661if (ice->state.streamout_active != active) {3662ice->state.streamout_active = active;3663ice->state.dirty |= IRIS_DIRTY_STREAMOUT;36643665/* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because3666* it's a non-pipelined command. If we're switching streamout on, we3667* may have missed emitting it earlier, so do so now. (We're already3668* taking a stall to update 3DSTATE_SO_BUFFERS anyway...)3669*/3670if (active) {3671ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;3672} else {3673uint32_t flush = 0;3674for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {3675struct iris_stream_output_target *tgt =3676(void *) ice->state.so_target[i];3677if (tgt) {3678struct iris_resource *res = (void *) tgt->base.buffer;36793680flush |= iris_flush_bits_for_history(ice, res);3681iris_dirty_for_history(ice, res);3682}3683}3684#if GFX_VER >= 123685/* SO draws require flushing of const cache to make SO data3686* observable when VB/IB are cached in L3.3687*/3688if (flush & PIPE_CONTROL_VF_CACHE_INVALIDATE)3689flush |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;3690#endif3691iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],3692"make streamout results visible", flush);3693}3694}36953696for (int i = 0; i < 4; i++) {3697pipe_so_target_reference(&ice->state.so_target[i],3698i < num_targets ? targets[i] : NULL);3699}37003701/* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */3702if (!active)3703return;37043705for (unsigned i = 0; i < 4; i++,3706so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {37073708struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];3709unsigned offset = offsets[i];37103711if (!tgt) {3712iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {3713#if GFX_VER < 123714sob.SOBufferIndex = i;3715#else3716sob._3DCommandOpcode = 0;3717sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;3718#endif3719}3720continue;3721}37223723if (!tgt->offset.res)3724upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);37253726struct iris_resource *res = (void *) tgt->base.buffer;37273728/* Note that offsets[i] will either be 0, causing us to zero3729* the value in the buffer, or 0xFFFFFFFF, which happens to mean3730* "continue appending at the existing offset."3731*/3732assert(offset == 0 || offset == 0xFFFFFFFF);37333734/* When we're first called with an offset of 0, we want the next3735* 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.3736* Any further times we emit those packets, we want to use 0xFFFFFFFF3737* to continue appending from the current offset.3738*3739* Note that we might be called by Begin (offset = 0), Pause, then3740* Resume (offset = 0xFFFFFFFF) before ever drawing (where these3741* commands will actually be sent to the GPU). In this case, we3742* don't want to append - we still want to do our initial zeroing.3743*/3744if (offset == 0)3745tgt->zero_offset = true;37463747iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {3748#if GFX_VER < 123749sob.SOBufferIndex = i;3750#else3751sob._3DCommandOpcode = 0;3752sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;3753#endif3754sob.SurfaceBaseAddress =3755rw_bo(NULL, res->bo->gtt_offset + tgt->base.buffer_offset,3756IRIS_DOMAIN_OTHER_WRITE);3757sob.SOBufferEnable = true;3758sob.StreamOffsetWriteEnable = true;3759sob.StreamOutputBufferOffsetAddressEnable = true;3760sob.MOCS = iris_mocs(res->bo, &screen->isl_dev, 0);37613762sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;3763sob.StreamOutputBufferOffsetAddress =3764rw_bo(NULL, iris_resource_bo(tgt->offset.res)->gtt_offset +3765tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);3766sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */3767}3768}37693770ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;3771}37723773/**3774* An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and3775* 3DSTATE_STREAMOUT packets.3776*3777* 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout3778* hardware to record. We can create it entirely based on the shader, with3779* no dynamic state dependencies.3780*3781* 3DSTATE_STREAMOUT is an annoying mix of shader-based information and3782* state-based settings. We capture the shader-related ones here, and merge3783* the rest in at draw time.3784*/3785static uint32_t *3786iris_create_so_decl_list(const struct pipe_stream_output_info *info,3787const struct brw_vue_map *vue_map)3788{3789struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];3790int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};3791int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};3792int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};3793int max_decls = 0;3794STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);37953796memset(so_decl, 0, sizeof(so_decl));37973798/* Construct the list of SO_DECLs to be emitted. The formatting of the3799* command feels strange -- each dword pair contains a SO_DECL per stream.3800*/3801for (unsigned i = 0; i < info->num_outputs; i++) {3802const struct pipe_stream_output *output = &info->output[i];3803const int buffer = output->output_buffer;3804const int varying = output->register_index;3805const unsigned stream_id = output->stream;3806assert(stream_id < MAX_VERTEX_STREAMS);38073808buffer_mask[stream_id] |= 1 << buffer;38093810assert(vue_map->varying_to_slot[varying] >= 0);38113812/* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]3813* array. Instead, it simply increments DstOffset for the following3814* input by the number of components that should be skipped.3815*3816* Our hardware is unusual in that it requires us to program SO_DECLs3817* for fake "hole" components, rather than simply taking the offset3818* for each real varying. Each hole can have size 1, 2, 3, or 4; we3819* program as many size = 4 holes as we can, then a final hole to3820* accommodate the final 1, 2, or 3 remaining.3821*/3822int skip_components = output->dst_offset - next_offset[buffer];38233824while (skip_components > 0) {3825so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {3826.HoleFlag = 1,3827.OutputBufferSlot = output->output_buffer,3828.ComponentMask = (1 << MIN2(skip_components, 4)) - 1,3829};3830skip_components -= 4;3831}38323833next_offset[buffer] = output->dst_offset + output->num_components;38343835so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {3836.OutputBufferSlot = output->output_buffer,3837.RegisterIndex = vue_map->varying_to_slot[varying],3838.ComponentMask =3839((1 << output->num_components) - 1) << output->start_component,3840};38413842if (decls[stream_id] > max_decls)3843max_decls = decls[stream_id];3844}38453846unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);3847uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);3848uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);38493850iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {3851int urb_entry_read_offset = 0;3852int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -3853urb_entry_read_offset;38543855/* We always read the whole vertex. This could be reduced at some3856* point by reading less and offsetting the register index in the3857* SO_DECLs.3858*/3859sol.Stream0VertexReadOffset = urb_entry_read_offset;3860sol.Stream0VertexReadLength = urb_entry_read_length - 1;3861sol.Stream1VertexReadOffset = urb_entry_read_offset;3862sol.Stream1VertexReadLength = urb_entry_read_length - 1;3863sol.Stream2VertexReadOffset = urb_entry_read_offset;3864sol.Stream2VertexReadLength = urb_entry_read_length - 1;3865sol.Stream3VertexReadOffset = urb_entry_read_offset;3866sol.Stream3VertexReadLength = urb_entry_read_length - 1;38673868/* Set buffer pitches; 0 means unbound. */3869sol.Buffer0SurfacePitch = 4 * info->stride[0];3870sol.Buffer1SurfacePitch = 4 * info->stride[1];3871sol.Buffer2SurfacePitch = 4 * info->stride[2];3872sol.Buffer3SurfacePitch = 4 * info->stride[3];3873}38743875iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {3876list.DWordLength = 3 + 2 * max_decls - 2;3877list.StreamtoBufferSelects0 = buffer_mask[0];3878list.StreamtoBufferSelects1 = buffer_mask[1];3879list.StreamtoBufferSelects2 = buffer_mask[2];3880list.StreamtoBufferSelects3 = buffer_mask[3];3881list.NumEntries0 = decls[0];3882list.NumEntries1 = decls[1];3883list.NumEntries2 = decls[2];3884list.NumEntries3 = decls[3];3885}38863887for (int i = 0; i < max_decls; i++) {3888iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {3889entry.Stream0Decl = so_decl[0][i];3890entry.Stream1Decl = so_decl[1][i];3891entry.Stream2Decl = so_decl[2][i];3892entry.Stream3Decl = so_decl[3][i];3893}3894}38953896return map;3897}38983899static void3900iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,3901const struct brw_vue_map *last_vue_map,3902bool two_sided_color,3903unsigned *out_offset,3904unsigned *out_length)3905{3906/* The compiler computes the first URB slot without considering COL/BFC3907* swizzling (because it doesn't know whether it's enabled), so we need3908* to do that here too. This may result in a smaller offset, which3909* should be safe.3910*/3911const unsigned first_slot =3912brw_compute_first_urb_slot_required(fs_input_slots, last_vue_map);39133914/* This becomes the URB read offset (counted in pairs of slots). */3915assert(first_slot % 2 == 0);3916*out_offset = first_slot / 2;39173918/* We need to adjust the inputs read to account for front/back color3919* swizzling, as it can make the URB length longer.3920*/3921for (int c = 0; c <= 1; c++) {3922if (fs_input_slots & (VARYING_BIT_COL0 << c)) {3923/* If two sided color is enabled, the fragment shader's gl_Color3924* (COL0) input comes from either the gl_FrontColor (COL0) or3925* gl_BackColor (BFC0) input varyings. Mark BFC as used, too.3926*/3927if (two_sided_color)3928fs_input_slots |= (VARYING_BIT_BFC0 << c);39293930/* If front color isn't written, we opt to give them back color3931* instead of an undefined value. Switch from COL to BFC.3932*/3933if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {3934fs_input_slots &= ~(VARYING_BIT_COL0 << c);3935fs_input_slots |= (VARYING_BIT_BFC0 << c);3936}3937}3938}39393940/* Compute the minimum URB Read Length necessary for the FS inputs.3941*3942* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for3943* 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":3944*3945* "This field should be set to the minimum length required to read the3946* maximum source attribute. The maximum source attribute is indicated3947* by the maximum value of the enabled Attribute # Source Attribute if3948* Attribute Swizzle Enable is set, Number of Output Attributes-1 if3949* enable is not set.3950* read_length = ceiling((max_source_attr + 1) / 2)3951*3952* [errata] Corruption/Hang possible if length programmed larger than3953* recommended"3954*3955* Similar text exists for Ivy Bridge.3956*3957* We find the last URB slot that's actually read by the FS.3958*/3959unsigned last_read_slot = last_vue_map->num_slots - 1;3960while (last_read_slot > first_slot && !(fs_input_slots &3961(1ull << last_vue_map->slot_to_varying[last_read_slot])))3962--last_read_slot;39633964/* The URB read length is the difference of the two, counted in pairs. */3965*out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);3966}39673968static void3969iris_emit_sbe_swiz(struct iris_batch *batch,3970const struct iris_context *ice,3971const struct brw_vue_map *vue_map,3972unsigned urb_read_offset,3973unsigned sprite_coord_enables)3974{3975struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};3976const struct brw_wm_prog_data *wm_prog_data = (void *)3977ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;3978const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;39793980/* XXX: this should be generated when putting programs in place */39813982for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {3983const uint8_t fs_attr = wm_prog_data->urb_setup_attribs[idx];3984const int input_index = wm_prog_data->urb_setup[fs_attr];3985if (input_index < 0 || input_index >= 16)3986continue;39873988struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =3989&attr_overrides[input_index];3990int slot = vue_map->varying_to_slot[fs_attr];39913992/* Viewport and Layer are stored in the VUE header. We need to override3993* them to zero if earlier stages didn't write them, as GL requires that3994* they read back as zero when not explicitly set.3995*/3996switch (fs_attr) {3997case VARYING_SLOT_VIEWPORT:3998case VARYING_SLOT_LAYER:3999attr->ComponentOverrideX = true;4000attr->ComponentOverrideW = true;4001attr->ConstantSource = CONST_0000;40024003if (!(vue_map->slots_valid & VARYING_BIT_LAYER))4004attr->ComponentOverrideY = true;4005if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))4006attr->ComponentOverrideZ = true;4007continue;40084009case VARYING_SLOT_PRIMITIVE_ID:4010/* Override if the previous shader stage didn't write gl_PrimitiveID. */4011if (slot == -1) {4012attr->ComponentOverrideX = true;4013attr->ComponentOverrideY = true;4014attr->ComponentOverrideZ = true;4015attr->ComponentOverrideW = true;4016attr->ConstantSource = PRIM_ID;4017continue;4018}4019break;40204021default:4022break;4023}40244025if (sprite_coord_enables & (1 << input_index))4026continue;40274028/* If there was only a back color written but not front, use back4029* as the color instead of undefined.4030*/4031if (slot == -1 && fs_attr == VARYING_SLOT_COL0)4032slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];4033if (slot == -1 && fs_attr == VARYING_SLOT_COL1)4034slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];40354036/* Not written by the previous stage - undefined. */4037if (slot == -1) {4038attr->ComponentOverrideX = true;4039attr->ComponentOverrideY = true;4040attr->ComponentOverrideZ = true;4041attr->ComponentOverrideW = true;4042attr->ConstantSource = CONST_0001_FLOAT;4043continue;4044}40454046/* Compute the location of the attribute relative to the read offset,4047* which is counted in 256-bit increments (two 128-bit VUE slots).4048*/4049const int source_attr = slot - 2 * urb_read_offset;4050assert(source_attr >= 0 && source_attr <= 32);4051attr->SourceAttribute = source_attr;40524053/* If we are doing two-sided color, and the VUE slot following this one4054* represents a back-facing color, then we need to instruct the SF unit4055* to do back-facing swizzling.4056*/4057if (cso_rast->light_twoside &&4058((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&4059vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||4060(vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&4061vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))4062attr->SwizzleSelect = INPUTATTR_FACING;4063}40644065iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {4066for (int i = 0; i < 16; i++)4067sbes.Attribute[i] = attr_overrides[i];4068}4069}40704071static bool4072iris_is_drawing_points(const struct iris_context *ice)4073{4074const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;40754076if (cso_rast->fill_mode_point) {4077return true;4078}40794080if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {4081const struct brw_gs_prog_data *gs_prog_data =4082(void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;4083return gs_prog_data->output_topology == _3DPRIM_POINTLIST;4084} else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {4085const struct brw_tes_prog_data *tes_data =4086(void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;4087return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;4088} else {4089return ice->state.prim_mode == PIPE_PRIM_POINTS;4090}4091}40924093static unsigned4094iris_calculate_point_sprite_overrides(const struct brw_wm_prog_data *prog_data,4095const struct iris_rasterizer_state *cso)4096{4097unsigned overrides = 0;40984099if (prog_data->urb_setup[VARYING_SLOT_PNTC] != -1)4100overrides |= 1 << prog_data->urb_setup[VARYING_SLOT_PNTC];41014102for (int i = 0; i < 8; i++) {4103if ((cso->sprite_coord_enable & (1 << i)) &&4104prog_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)4105overrides |= 1 << prog_data->urb_setup[VARYING_SLOT_TEX0 + i];4106}41074108return overrides;4109}41104111static void4112iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)4113{4114const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;4115const struct brw_wm_prog_data *wm_prog_data = (void *)4116ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;4117const struct shader_info *fs_info =4118iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);4119const struct brw_vue_map *last_vue_map =4120&brw_vue_prog_data(ice->shaders.last_vue_shader->prog_data)->vue_map;41214122unsigned urb_read_offset, urb_read_length;4123iris_compute_sbe_urb_read_interval(fs_info->inputs_read,4124last_vue_map,4125cso_rast->light_twoside,4126&urb_read_offset, &urb_read_length);41274128unsigned sprite_coord_overrides =4129iris_is_drawing_points(ice) ?4130iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast) : 0;41314132iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {4133sbe.AttributeSwizzleEnable = true;4134sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;4135sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;4136sbe.VertexURBEntryReadOffset = urb_read_offset;4137sbe.VertexURBEntryReadLength = urb_read_length;4138sbe.ForceVertexURBEntryReadOffset = true;4139sbe.ForceVertexURBEntryReadLength = true;4140sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;4141sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;4142#if GFX_VER >= 94143for (int i = 0; i < 32; i++) {4144sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;4145}4146#endif4147}41484149iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,4150sprite_coord_overrides);4151}41524153/* ------------------------------------------------------------------- */41544155/**4156* Populate VS program key fields based on the current state.4157*/4158static void4159iris_populate_vs_key(const struct iris_context *ice,4160const struct shader_info *info,4161gl_shader_stage last_stage,4162struct iris_vs_prog_key *key)4163{4164const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;41654166if (info->clip_distance_array_size == 0 &&4167(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&4168last_stage == MESA_SHADER_VERTEX)4169key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;4170}41714172/**4173* Populate TCS program key fields based on the current state.4174*/4175static void4176iris_populate_tcs_key(const struct iris_context *ice,4177struct iris_tcs_prog_key *key)4178{4179}41804181/**4182* Populate TES program key fields based on the current state.4183*/4184static void4185iris_populate_tes_key(const struct iris_context *ice,4186const struct shader_info *info,4187gl_shader_stage last_stage,4188struct iris_tes_prog_key *key)4189{4190const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;41914192if (info->clip_distance_array_size == 0 &&4193(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&4194last_stage == MESA_SHADER_TESS_EVAL)4195key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;4196}41974198/**4199* Populate GS program key fields based on the current state.4200*/4201static void4202iris_populate_gs_key(const struct iris_context *ice,4203const struct shader_info *info,4204gl_shader_stage last_stage,4205struct iris_gs_prog_key *key)4206{4207const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;42084209if (info->clip_distance_array_size == 0 &&4210(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&4211last_stage == MESA_SHADER_GEOMETRY)4212key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;4213}42144215/**4216* Populate FS program key fields based on the current state.4217*/4218static void4219iris_populate_fs_key(const struct iris_context *ice,4220const struct shader_info *info,4221struct iris_fs_prog_key *key)4222{4223struct iris_screen *screen = (void *) ice->ctx.screen;4224const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;4225const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;4226const struct iris_rasterizer_state *rast = ice->state.cso_rast;4227const struct iris_blend_state *blend = ice->state.cso_blend;42284229key->nr_color_regions = fb->nr_cbufs;42304231key->clamp_fragment_color = rast->clamp_fragment_color;42324233key->alpha_to_coverage = blend->alpha_to_coverage;42344235key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;42364237key->flat_shade = rast->flatshade &&4238(info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));42394240key->persample_interp = rast->force_persample_interp;4241key->multisample_fbo = rast->multisample && fb->samples > 1;42424243key->coherent_fb_fetch = GFX_VER >= 9;42444245key->force_dual_color_blend =4246screen->driconf.dual_color_blend_by_location &&4247(blend->blend_enables & 1) && blend->dual_color_blending;42484249/* TODO: Respect glHint for key->high_quality_derivatives */4250}42514252static void4253iris_populate_cs_key(const struct iris_context *ice,4254struct iris_cs_prog_key *key)4255{4256}42574258static uint64_t4259KSP(const struct iris_compiled_shader *shader)4260{4261struct iris_resource *res = (void *) shader->assembly.res;4262return iris_bo_offset_from_base_address(res->bo) + shader->assembly.offset;4263}42644265#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \4266pkt.KernelStartPointer = KSP(shader); \4267pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \4268pkt.FloatingPointMode = prog_data->use_alt_mode; \4269\4270pkt.DispatchGRFStartRegisterForURBData = \4271prog_data->dispatch_grf_start_reg; \4272pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \4273pkt.prefix##URBEntryReadOffset = 0; \4274\4275pkt.StatisticsEnable = true; \4276pkt.Enable = true; \4277\4278if (prog_data->total_scratch) { \4279INIT_THREAD_SCRATCH_SIZE(pkt) \4280}42814282#if GFX_VERx10 >= 1254283#define INIT_THREAD_SCRATCH_SIZE(pkt)4284#define MERGE_SCRATCH_ADDR(name) \4285{ \4286uint32_t pkt2[GENX(name##_length)] = {0}; \4287_iris_pack_command(batch, GENX(name), pkt2, p) { \4288p.ScratchSpaceBuffer = scratch_addr >> 4; \4289} \4290iris_emit_merge(batch, pkt, pkt2, GENX(name##_length)); \4291}4292#else4293#define INIT_THREAD_SCRATCH_SIZE(pkt) \4294pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;4295#define MERGE_SCRATCH_ADDR(name) \4296{ \4297uint32_t pkt2[GENX(name##_length)] = {0}; \4298_iris_pack_command(batch, GENX(name), pkt2, p) { \4299p.ScratchSpaceBasePointer = \4300rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE); \4301} \4302iris_emit_merge(batch, pkt, pkt2, GENX(name##_length)); \4303}4304#endif430543064307/**4308* Encode most of 3DSTATE_VS based on the compiled shader.4309*/4310static void4311iris_store_vs_state(const struct intel_device_info *devinfo,4312struct iris_compiled_shader *shader)4313{4314struct brw_stage_prog_data *prog_data = shader->prog_data;4315struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;43164317iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {4318INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);4319vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;4320vs.SIMD8DispatchEnable = true;4321vs.UserClipDistanceCullTestEnableBitmask =4322vue_prog_data->cull_distance_mask;4323}4324}43254326/**4327* Encode most of 3DSTATE_HS based on the compiled shader.4328*/4329static void4330iris_store_tcs_state(const struct intel_device_info *devinfo,4331struct iris_compiled_shader *shader)4332{4333struct brw_stage_prog_data *prog_data = shader->prog_data;4334struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;4335struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;43364337iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {4338INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);43394340#if GFX_VER >= 124341/* Wa_1604578095:4342*4343* Hang occurs when the number of max threads is less than 2 times4344* the number of instance count. The number of max threads must be4345* more than 2 times the number of instance count.4346*/4347assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);4348hs.DispatchGRFStartRegisterForURBData = prog_data->dispatch_grf_start_reg & 0x1f;4349hs.DispatchGRFStartRegisterForURBData5 = prog_data->dispatch_grf_start_reg >> 5;4350#endif43514352hs.InstanceCount = tcs_prog_data->instances - 1;4353hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;4354hs.IncludeVertexHandles = true;43554356#if GFX_VER == 124357/* Patch Count threshold specifies the maximum number of patches that4358* will be accumulated before a thread dispatch is forced.4359*/4360hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;4361#endif43624363#if GFX_VER >= 94364hs.DispatchMode = vue_prog_data->dispatch_mode;4365hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;4366#endif4367}4368}43694370/**4371* Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.4372*/4373static void4374iris_store_tes_state(const struct intel_device_info *devinfo,4375struct iris_compiled_shader *shader)4376{4377struct brw_stage_prog_data *prog_data = shader->prog_data;4378struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;4379struct brw_tes_prog_data *tes_prog_data = (void *) prog_data;43804381uint32_t *te_state = (void *) shader->derived_data;4382uint32_t *ds_state = te_state + GENX(3DSTATE_TE_length);43834384iris_pack_command(GENX(3DSTATE_TE), te_state, te) {4385te.Partitioning = tes_prog_data->partitioning;4386te.OutputTopology = tes_prog_data->output_topology;4387te.TEDomain = tes_prog_data->domain;4388te.TEEnable = true;4389te.MaximumTessellationFactorOdd = 63.0;4390te.MaximumTessellationFactorNotOdd = 64.0;4391}43924393iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {4394INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);43954396ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;4397ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;4398ds.ComputeWCoordinateEnable =4399tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;44004401ds.UserClipDistanceCullTestEnableBitmask =4402vue_prog_data->cull_distance_mask;4403}44044405}44064407/**4408* Encode most of 3DSTATE_GS based on the compiled shader.4409*/4410static void4411iris_store_gs_state(const struct intel_device_info *devinfo,4412struct iris_compiled_shader *shader)4413{4414struct brw_stage_prog_data *prog_data = shader->prog_data;4415struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;4416struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;44174418iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {4419INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);44204421gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;4422gs.OutputTopology = gs_prog_data->output_topology;4423gs.ControlDataHeaderSize =4424gs_prog_data->control_data_header_size_hwords;4425gs.InstanceControl = gs_prog_data->invocations - 1;4426gs.DispatchMode = DISPATCH_MODE_SIMD8;4427gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;4428gs.ControlDataFormat = gs_prog_data->control_data_format;4429gs.ReorderMode = TRAILING;4430gs.ExpectedVertexCount = gs_prog_data->vertices_in;4431gs.MaximumNumberofThreads =4432GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)4433: (devinfo->max_gs_threads - 1);44344435if (gs_prog_data->static_vertex_count != -1) {4436gs.StaticOutput = true;4437gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;4438}4439gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;44404441gs.UserClipDistanceCullTestEnableBitmask =4442vue_prog_data->cull_distance_mask;44434444const int urb_entry_write_offset = 1;4445const uint32_t urb_entry_output_length =4446DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -4447urb_entry_write_offset;44484449gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;4450gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);4451}4452}44534454/**4455* Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.4456*/4457static void4458iris_store_fs_state(const struct intel_device_info *devinfo,4459struct iris_compiled_shader *shader)4460{4461struct brw_stage_prog_data *prog_data = shader->prog_data;4462struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;44634464uint32_t *ps_state = (void *) shader->derived_data;4465uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);44664467iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {4468ps.VectorMaskEnable = true;4469ps.BindingTableEntryCount = shader->bt.size_bytes / 4;4470ps.FloatingPointMode = prog_data->use_alt_mode;4471ps.MaximumNumberofThreadsPerPSD = 64 - (GFX_VER == 8 ? 2 : 1);44724473ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;44744475/* From the documentation for this packet:4476* "If the PS kernel does not need the Position XY Offsets to4477* compute a Position Value, then this field should be programmed4478* to POSOFFSET_NONE."4479*4480* "SW Recommendation: If the PS kernel needs the Position Offsets4481* to compute a Position XY value, this field should match Position4482* ZW Interpolation Mode to ensure a consistent position.xyzw4483* computation."4484*4485* We only require XY sample offsets. So, this recommendation doesn't4486* look useful at the moment. We might need this in future.4487*/4488ps.PositionXYOffsetSelect =4489wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;44904491if (prog_data->total_scratch) {4492INIT_THREAD_SCRATCH_SIZE(ps);4493}4494}44954496iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {4497psx.PixelShaderValid = true;4498psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;4499psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;4500psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;4501psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;4502psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;4503psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;4504psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;45054506#if GFX_VER >= 94507psx.PixelShaderPullsBary = wm_prog_data->pulls_bary;4508psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil;4509#endif4510}4511}45124513/**4514* Compute the size of the derived data (shader command packets).4515*4516* This must match the data written by the iris_store_xs_state() functions.4517*/4518static void4519iris_store_cs_state(const struct intel_device_info *devinfo,4520struct iris_compiled_shader *shader)4521{4522struct brw_cs_prog_data *cs_prog_data = (void *) shader->prog_data;4523void *map = shader->derived_data;45244525iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {4526#if GFX_VERx10 < 1254527desc.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;4528desc.CrossThreadConstantDataReadLength =4529cs_prog_data->push.cross_thread.regs;4530#else4531assert(cs_prog_data->push.per_thread.regs == 0);4532assert(cs_prog_data->push.cross_thread.regs == 0);4533#endif4534desc.BarrierEnable = cs_prog_data->uses_barrier;4535#if GFX_VER >= 124536/* TODO: Check if we are missing workarounds and enable mid-thread4537* preemption.4538*4539* We still have issues with mid-thread preemption (it was already4540* disabled by the kernel on gfx11, due to missing workarounds). It's4541* possible that we are just missing some workarounds, and could enable4542* it later, but for now let's disable it to fix a GPU in compute in Car4543* Chase (and possibly more).4544*/4545desc.ThreadPreemptionDisable = true;4546#endif4547}4548}45494550static unsigned4551iris_derived_program_state_size(enum iris_program_cache_id cache_id)4552{4553assert(cache_id <= IRIS_CACHE_BLORP);45544555static const unsigned dwords[] = {4556[IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),4557[IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),4558[IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),4559[IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),4560[IRIS_CACHE_FS] =4561GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),4562[IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),4563[IRIS_CACHE_BLORP] = 0,4564};45654566return sizeof(uint32_t) * dwords[cache_id];4567}45684569/**4570* Create any state packets corresponding to the given shader stage4571* (i.e. 3DSTATE_VS) and save them as "derived data" in the shader variant.4572* This means that we can look up a program in the in-memory cache and4573* get most of the state packet without having to reconstruct it.4574*/4575static void4576iris_store_derived_program_state(const struct intel_device_info *devinfo,4577enum iris_program_cache_id cache_id,4578struct iris_compiled_shader *shader)4579{4580switch (cache_id) {4581case IRIS_CACHE_VS:4582iris_store_vs_state(devinfo, shader);4583break;4584case IRIS_CACHE_TCS:4585iris_store_tcs_state(devinfo, shader);4586break;4587case IRIS_CACHE_TES:4588iris_store_tes_state(devinfo, shader);4589break;4590case IRIS_CACHE_GS:4591iris_store_gs_state(devinfo, shader);4592break;4593case IRIS_CACHE_FS:4594iris_store_fs_state(devinfo, shader);4595break;4596case IRIS_CACHE_CS:4597iris_store_cs_state(devinfo, shader);4598break;4599case IRIS_CACHE_BLORP:4600break;4601}4602}46034604/* ------------------------------------------------------------------- */46054606static const uint32_t push_constant_opcodes[] = {4607[MESA_SHADER_VERTEX] = 21,4608[MESA_SHADER_TESS_CTRL] = 25, /* HS */4609[MESA_SHADER_TESS_EVAL] = 26, /* DS */4610[MESA_SHADER_GEOMETRY] = 22,4611[MESA_SHADER_FRAGMENT] = 23,4612[MESA_SHADER_COMPUTE] = 0,4613};46144615static uint32_t4616use_null_surface(struct iris_batch *batch, struct iris_context *ice)4617{4618struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);46194620iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);46214622return ice->state.unbound_tex.offset;4623}46244625static uint32_t4626use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)4627{4628/* If set_framebuffer_state() was never called, fall back to 1x1x1 */4629if (!ice->state.null_fb.res)4630return use_null_surface(batch, ice);46314632struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);46334634iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);46354636return ice->state.null_fb.offset;4637}46384639static uint32_t4640surf_state_offset_for_aux(struct iris_resource *res,4641unsigned aux_modes,4642enum isl_aux_usage aux_usage)4643{4644assert(aux_modes & (1 << aux_usage));4645return SURFACE_STATE_ALIGNMENT *4646util_bitcount(aux_modes & ((1 << aux_usage) - 1));4647}46484649#if GFX_VER == 94650static void4651surf_state_update_clear_value(struct iris_batch *batch,4652struct iris_resource *res,4653struct iris_state_ref *state,4654unsigned aux_modes,4655enum isl_aux_usage aux_usage)4656{4657struct isl_device *isl_dev = &batch->screen->isl_dev;4658struct iris_bo *state_bo = iris_resource_bo(state->res);4659uint64_t real_offset = state->offset + IRIS_MEMZONE_BINDER_START;4660uint32_t offset_into_bo = real_offset - state_bo->gtt_offset;4661uint32_t clear_offset = offset_into_bo +4662isl_dev->ss.clear_value_offset +4663surf_state_offset_for_aux(res, aux_modes, aux_usage);4664uint32_t *color = res->aux.clear_color.u32;46654666assert(isl_dev->ss.clear_value_size == 16);46674668if (aux_usage == ISL_AUX_USAGE_HIZ) {4669iris_emit_pipe_control_write(batch, "update fast clear value (Z)",4670PIPE_CONTROL_WRITE_IMMEDIATE,4671state_bo, clear_offset, color[0]);4672} else {4673iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",4674PIPE_CONTROL_WRITE_IMMEDIATE,4675state_bo, clear_offset,4676(uint64_t) color[0] |4677(uint64_t) color[1] << 32);4678iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",4679PIPE_CONTROL_WRITE_IMMEDIATE,4680state_bo, clear_offset + 8,4681(uint64_t) color[2] |4682(uint64_t) color[3] << 32);4683}46844685iris_emit_pipe_control_flush(batch,4686"update fast clear: state cache invalidate",4687PIPE_CONTROL_FLUSH_ENABLE |4688PIPE_CONTROL_STATE_CACHE_INVALIDATE);4689}4690#endif46914692static void4693update_clear_value(struct iris_context *ice,4694struct iris_batch *batch,4695struct iris_resource *res,4696struct iris_surface_state *surf_state,4697unsigned all_aux_modes,4698struct isl_view *view)4699{4700UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;4701UNUSED unsigned aux_modes = all_aux_modes;47024703/* We only need to update the clear color in the surface state for gfx8 and4704* gfx9. Newer gens can read it directly from the clear color state buffer.4705*/4706#if GFX_VER == 94707/* Skip updating the ISL_AUX_USAGE_NONE surface state */4708aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);47094710while (aux_modes) {4711enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);47124713surf_state_update_clear_value(batch, res, &surf_state->ref,4714all_aux_modes, aux_usage);4715}4716#elif GFX_VER == 84717/* TODO: Could update rather than re-filling */4718alloc_surface_states(surf_state, all_aux_modes);47194720void *map = surf_state->cpu;47214722while (aux_modes) {4723enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);4724fill_surface_state(isl_dev, map, res, &res->surf, view, aux_usage,47250, 0, 0);4726map += SURFACE_STATE_ALIGNMENT;4727}47284729upload_surface_states(ice->state.surface_uploader, surf_state);4730#endif4731}47324733/**4734* Add a surface to the validation list, as well as the buffer containing4735* the corresponding SURFACE_STATE.4736*4737* Returns the binding table entry (offset to SURFACE_STATE).4738*/4739static uint32_t4740use_surface(struct iris_context *ice,4741struct iris_batch *batch,4742struct pipe_surface *p_surf,4743bool writeable,4744enum isl_aux_usage aux_usage,4745bool is_read_surface,4746enum iris_domain access)4747{4748struct iris_surface *surf = (void *) p_surf;4749struct iris_resource *res = (void *) p_surf->texture;4750uint32_t offset = 0;47514752if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {4753upload_surface_states(ice->state.surface_uploader,4754&surf->surface_state_read);4755}47564757if (!surf->surface_state.ref.res) {4758upload_surface_states(ice->state.surface_uploader,4759&surf->surface_state);4760}47614762if (res->aux.bo) {4763iris_use_pinned_bo(batch, res->aux.bo, writeable, access);4764if (res->aux.clear_color_bo)4765iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);47664767if (memcmp(&res->aux.clear_color, &surf->clear_color,4768sizeof(surf->clear_color)) != 0) {4769update_clear_value(ice, batch, res, &surf->surface_state,4770res->aux.possible_usages, &surf->view);4771if (GFX_VER == 8) {4772update_clear_value(ice, batch, res, &surf->surface_state_read,4773res->aux.possible_usages, &surf->read_view);4774}4775surf->clear_color = res->aux.clear_color;4776}4777}47784779iris_use_pinned_bo(batch, iris_resource_bo(p_surf->texture),4780writeable, access);4781if (GFX_VER == 8 && is_read_surface) {4782iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.ref.res), false,4783IRIS_DOMAIN_NONE);4784} else {4785iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.ref.res), false,4786IRIS_DOMAIN_NONE);4787}47884789offset = (GFX_VER == 8 && is_read_surface)4790? surf->surface_state_read.ref.offset4791: surf->surface_state.ref.offset;47924793return offset +4794surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage);4795}47964797static uint32_t4798use_sampler_view(struct iris_context *ice,4799struct iris_batch *batch,4800struct iris_sampler_view *isv)4801{4802enum isl_aux_usage aux_usage =4803iris_resource_texture_aux_usage(ice, isv->res, isv->view.format);48044805if (!isv->surface_state.ref.res)4806upload_surface_states(ice->state.surface_uploader, &isv->surface_state);48074808if (isv->res->aux.bo) {4809iris_use_pinned_bo(batch, isv->res->aux.bo,4810false, IRIS_DOMAIN_OTHER_READ);4811if (isv->res->aux.clear_color_bo)4812iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,4813false, IRIS_DOMAIN_OTHER_READ);4814if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,4815sizeof(isv->clear_color)) != 0) {4816update_clear_value(ice, batch, isv->res, &isv->surface_state,4817isv->res->aux.sampler_usages, &isv->view);4818isv->clear_color = isv->res->aux.clear_color;4819}4820}48214822iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_OTHER_READ);4823iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.ref.res), false,4824IRIS_DOMAIN_NONE);48254826return isv->surface_state.ref.offset +4827surf_state_offset_for_aux(isv->res, isv->res->aux.sampler_usages,4828aux_usage);4829}48304831static uint32_t4832use_ubo_ssbo(struct iris_batch *batch,4833struct iris_context *ice,4834struct pipe_shader_buffer *buf,4835struct iris_state_ref *surf_state,4836bool writable, enum iris_domain access)4837{4838if (!buf->buffer || !surf_state->res)4839return use_null_surface(batch, ice);48404841iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);4842iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,4843IRIS_DOMAIN_NONE);48444845return surf_state->offset;4846}48474848static uint32_t4849use_image(struct iris_batch *batch, struct iris_context *ice,4850struct iris_shader_state *shs, const struct shader_info *info,4851int i)4852{4853struct iris_image_view *iv = &shs->image[i];4854struct iris_resource *res = (void *) iv->base.resource;48554856if (!res)4857return use_null_surface(batch, ice);48584859bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;48604861iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);4862iris_use_pinned_bo(batch, iris_resource_bo(iv->surface_state.ref.res),4863false, IRIS_DOMAIN_NONE);48644865if (res->aux.bo)4866iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);48674868enum isl_aux_usage aux_usage =4869iris_image_view_aux_usage(ice, &iv->base, info);48704871return iv->surface_state.ref.offset +4872surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage);4873}48744875#define push_bt_entry(addr) \4876assert(addr >= binder_addr); \4877assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \4878if (!pin_only) bt_map[s++] = (addr) - binder_addr;48794880#define bt_assert(section) \4881if (!pin_only && shader->bt.used_mask[section] != 0) \4882assert(shader->bt.offsets[section] == s);48834884/**4885* Populate the binding table for a given shader stage.4886*4887* This fills out the table of pointers to surfaces required by the shader,4888* and also adds those buffers to the validation list so the kernel can make4889* resident before running our batch.4890*/4891static void4892iris_populate_binding_table(struct iris_context *ice,4893struct iris_batch *batch,4894gl_shader_stage stage,4895bool pin_only)4896{4897const struct iris_binder *binder = &ice->state.binder;4898struct iris_compiled_shader *shader = ice->shaders.prog[stage];4899if (!shader)4900return;49014902struct iris_binding_table *bt = &shader->bt;4903UNUSED struct brw_stage_prog_data *prog_data = shader->prog_data;4904struct iris_shader_state *shs = &ice->state.shaders[stage];4905uint32_t binder_addr = binder->bo->gtt_offset;49064907uint32_t *bt_map = binder->map + binder->bt_offset[stage];4908int s = 0;49094910const struct shader_info *info = iris_get_shader_info(ice, stage);4911if (!info) {4912/* TCS passthrough doesn't need a binding table. */4913assert(stage == MESA_SHADER_TESS_CTRL);4914return;4915}49164917if (stage == MESA_SHADER_COMPUTE &&4918shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {4919/* surface for gl_NumWorkGroups */4920struct iris_state_ref *grid_data = &ice->state.grid_size;4921struct iris_state_ref *grid_state = &ice->state.grid_surf_state;4922iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,4923IRIS_DOMAIN_OTHER_READ);4924iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,4925IRIS_DOMAIN_NONE);4926push_bt_entry(grid_state->offset);4927}49284929if (stage == MESA_SHADER_FRAGMENT) {4930struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;4931/* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */4932if (cso_fb->nr_cbufs) {4933for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {4934uint32_t addr;4935if (cso_fb->cbufs[i]) {4936addr = use_surface(ice, batch, cso_fb->cbufs[i], true,4937ice->state.draw_aux_usage[i], false,4938IRIS_DOMAIN_RENDER_WRITE);4939} else {4940addr = use_null_fb_surface(batch, ice);4941}4942push_bt_entry(addr);4943}4944} else if (GFX_VER < 11) {4945uint32_t addr = use_null_fb_surface(batch, ice);4946push_bt_entry(addr);4947}4948}49494950#define foreach_surface_used(index, group) \4951bt_assert(group); \4952for (int index = 0; index < bt->sizes[group]; index++) \4953if (iris_group_index_to_bti(bt, group, index) != \4954IRIS_SURFACE_NOT_USED)49554956foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {4957struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;4958uint32_t addr;4959if (cso_fb->cbufs[i]) {4960addr = use_surface(ice, batch, cso_fb->cbufs[i],4961false, ice->state.draw_aux_usage[i], true,4962IRIS_DOMAIN_OTHER_READ);4963push_bt_entry(addr);4964}4965}49664967foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE) {4968struct iris_sampler_view *view = shs->textures[i];4969uint32_t addr = view ? use_sampler_view(ice, batch, view)4970: use_null_surface(batch, ice);4971push_bt_entry(addr);4972}49734974foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {4975uint32_t addr = use_image(batch, ice, shs, info, i);4976push_bt_entry(addr);4977}49784979foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {4980uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],4981&shs->constbuf_surf_state[i], false,4982IRIS_DOMAIN_OTHER_READ);4983push_bt_entry(addr);4984}49854986foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {4987uint32_t addr =4988use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],4989shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);4990push_bt_entry(addr);4991}49924993#if 04994/* XXX: YUV surfaces not implemented yet */4995bt_assert(plane_start[1], ...);4996bt_assert(plane_start[2], ...);4997#endif4998}49995000static void5001iris_use_optional_res(struct iris_batch *batch,5002struct pipe_resource *res,5003bool writeable,5004enum iris_domain access)5005{5006if (res) {5007struct iris_bo *bo = iris_resource_bo(res);5008iris_use_pinned_bo(batch, bo, writeable, access);5009}5010}50115012static void5013pin_depth_and_stencil_buffers(struct iris_batch *batch,5014struct pipe_surface *zsbuf,5015struct iris_depth_stencil_alpha_state *cso_zsa)5016{5017if (!zsbuf)5018return;50195020struct iris_resource *zres, *sres;5021iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);50225023if (zres) {5024const enum iris_domain access = cso_zsa->depth_writes_enabled ?5025IRIS_DOMAIN_DEPTH_WRITE : IRIS_DOMAIN_OTHER_READ;5026iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,5027access);5028if (zres->aux.bo) {5029iris_use_pinned_bo(batch, zres->aux.bo,5030cso_zsa->depth_writes_enabled, access);5031}5032}50335034if (sres) {5035const enum iris_domain access = cso_zsa->stencil_writes_enabled ?5036IRIS_DOMAIN_DEPTH_WRITE : IRIS_DOMAIN_OTHER_READ;5037iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,5038access);5039}5040}50415042static uint32_t5043pin_scratch_space(struct iris_context *ice,5044struct iris_batch *batch,5045const struct brw_stage_prog_data *prog_data,5046gl_shader_stage stage)5047{5048uint32_t scratch_addr = 0;50495050if (prog_data->total_scratch > 0) {5051struct iris_bo *scratch_bo =5052iris_get_scratch_space(ice, prog_data->total_scratch, stage);5053iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);50545055#if GFX_VERx10 >= 1255056const struct iris_state_ref *ref =5057iris_get_scratch_surf(ice, prog_data->total_scratch);5058iris_use_pinned_bo(batch, iris_resource_bo(ref->res),5059false, IRIS_DOMAIN_NONE);5060scratch_addr = ref->offset +5061iris_resource_bo(ref->res)->gtt_offset -5062IRIS_MEMZONE_BINDLESS_START;5063assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));5064#else5065scratch_addr = scratch_bo->gtt_offset;5066#endif5067}50685069return scratch_addr;5070}50715072/* ------------------------------------------------------------------- */50735074/**5075* Pin any BOs which were installed by a previous batch, and restored5076* via the hardware logical context mechanism.5077*5078* We don't need to re-emit all state every batch - the hardware context5079* mechanism will save and restore it for us. This includes pointers to5080* various BOs...which won't exist unless we ask the kernel to pin them5081* by adding them to the validation list.5082*5083* We can skip buffers if we've re-emitted those packets, as we're5084* overwriting those stale pointers with new ones, and don't actually5085* refer to the old BOs.5086*/5087static void5088iris_restore_render_saved_bos(struct iris_context *ice,5089struct iris_batch *batch,5090const struct pipe_draw_info *draw)5091{5092struct iris_genx_state *genx = ice->state.genx;50935094const uint64_t clean = ~ice->state.dirty;5095const uint64_t stage_clean = ~ice->state.stage_dirty;50965097if (clean & IRIS_DIRTY_CC_VIEWPORT) {5098iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,5099IRIS_DOMAIN_NONE);5100}51015102if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {5103iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,5104IRIS_DOMAIN_NONE);5105}51065107if (clean & IRIS_DIRTY_BLEND_STATE) {5108iris_use_optional_res(batch, ice->state.last_res.blend, false,5109IRIS_DOMAIN_NONE);5110}51115112if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {5113iris_use_optional_res(batch, ice->state.last_res.color_calc, false,5114IRIS_DOMAIN_NONE);5115}51165117if (clean & IRIS_DIRTY_SCISSOR_RECT) {5118iris_use_optional_res(batch, ice->state.last_res.scissor, false,5119IRIS_DOMAIN_NONE);5120}51215122if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {5123for (int i = 0; i < 4; i++) {5124struct iris_stream_output_target *tgt =5125(void *) ice->state.so_target[i];5126if (tgt) {5127iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),5128true, IRIS_DOMAIN_OTHER_WRITE);5129iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),5130true, IRIS_DOMAIN_OTHER_WRITE);5131}5132}5133}51345135for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5136if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))5137continue;51385139struct iris_shader_state *shs = &ice->state.shaders[stage];5140struct iris_compiled_shader *shader = ice->shaders.prog[stage];51415142if (!shader)5143continue;51445145struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;51465147for (int i = 0; i < 4; i++) {5148const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];51495150if (range->length == 0)5151continue;51525153/* Range block is a binding table index, map back to UBO index. */5154unsigned block_index = iris_bti_to_group_index(5155&shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);5156assert(block_index != IRIS_SURFACE_NOT_USED);51575158struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];5159struct iris_resource *res = (void *) cbuf->buffer;51605161if (res)5162iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);5163else5164iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,5165IRIS_DOMAIN_OTHER_READ);5166}5167}51685169for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5170if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {5171/* Re-pin any buffers referred to by the binding table. */5172iris_populate_binding_table(ice, batch, stage, true);5173}5174}51755176for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5177struct iris_shader_state *shs = &ice->state.shaders[stage];5178struct pipe_resource *res = shs->sampler_table.res;5179if (res)5180iris_use_pinned_bo(batch, iris_resource_bo(res), false,5181IRIS_DOMAIN_NONE);5182}51835184for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5185if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {5186struct iris_compiled_shader *shader = ice->shaders.prog[stage];51875188if (shader) {5189struct iris_bo *bo = iris_resource_bo(shader->assembly.res);5190iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);51915192pin_scratch_space(ice, batch, shader->prog_data, stage);5193}5194}5195}51965197if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&5198(clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {5199struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;5200pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);5201}52025203iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,5204IRIS_DOMAIN_OTHER_READ);52055206if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {5207uint64_t bound = ice->state.bound_vertex_buffers;5208while (bound) {5209const int i = u_bit_scan64(&bound);5210struct pipe_resource *res = genx->vertex_buffers[i].resource;5211iris_use_pinned_bo(batch, iris_resource_bo(res), false,5212IRIS_DOMAIN_OTHER_READ);5213}5214}5215}52165217static void5218iris_restore_compute_saved_bos(struct iris_context *ice,5219struct iris_batch *batch,5220const struct pipe_grid_info *grid)5221{5222const uint64_t stage_clean = ~ice->state.stage_dirty;52235224const int stage = MESA_SHADER_COMPUTE;5225struct iris_shader_state *shs = &ice->state.shaders[stage];52265227if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {5228/* Re-pin any buffers referred to by the binding table. */5229iris_populate_binding_table(ice, batch, stage, true);5230}52315232struct pipe_resource *sampler_res = shs->sampler_table.res;5233if (sampler_res)5234iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,5235IRIS_DOMAIN_NONE);52365237if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&5238(stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&5239(stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&5240(stage_clean & IRIS_STAGE_DIRTY_CS)) {5241iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,5242IRIS_DOMAIN_NONE);5243}52445245if (stage_clean & IRIS_STAGE_DIRTY_CS) {5246struct iris_compiled_shader *shader = ice->shaders.prog[stage];52475248if (shader) {5249struct iris_bo *bo = iris_resource_bo(shader->assembly.res);5250iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);52515252if (GFX_VERx10 < 125) {5253struct iris_bo *curbe_bo =5254iris_resource_bo(ice->state.last_res.cs_thread_ids);5255iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);5256}52575258pin_scratch_space(ice, batch, shader->prog_data, stage);5259}5260}5261}52625263/**5264* Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.5265*/5266static void5267iris_update_surface_base_address(struct iris_batch *batch,5268struct iris_binder *binder)5269{5270if (batch->last_surface_base_address == binder->bo->gtt_offset)5271return;52725273struct isl_device *isl_dev = &batch->screen->isl_dev;5274uint32_t mocs = isl_mocs(isl_dev, 0, false);52755276iris_batch_sync_region_start(batch);52775278flush_before_state_base_change(batch);52795280#if GFX_VER == 125281/* Wa_1607854226:5282*5283* Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline5284* mode by putting the pipeline temporarily in 3D mode..5285*/5286if (batch->name == IRIS_BATCH_COMPUTE)5287emit_pipeline_select(batch, _3D);5288#endif52895290iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {5291sba.SurfaceStateBaseAddressModifyEnable = true;5292sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);52935294/* The hardware appears to pay attention to the MOCS fields even5295* if you don't set the "Address Modify Enable" bit for the base.5296*/5297sba.GeneralStateMOCS = mocs;5298sba.StatelessDataPortAccessMOCS = mocs;5299sba.DynamicStateMOCS = mocs;5300sba.IndirectObjectMOCS = mocs;5301sba.InstructionMOCS = mocs;5302sba.SurfaceStateMOCS = mocs;5303#if GFX_VER >= 95304sba.BindlessSurfaceStateMOCS = mocs;5305#endif5306}53075308#if GFX_VER == 125309/* Wa_1607854226:5310*5311* Put the pipeline back into compute mode.5312*/5313if (batch->name == IRIS_BATCH_COMPUTE)5314emit_pipeline_select(batch, GPGPU);5315#endif53165317flush_after_state_base_change(batch);5318iris_batch_sync_region_end(batch);53195320batch->last_surface_base_address = binder->bo->gtt_offset;5321}53225323static inline void5324iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,5325bool window_space_position, float *zmin, float *zmax)5326{5327if (window_space_position) {5328*zmin = 0.f;5329*zmax = 1.f;5330return;5331}5332util_viewport_zmin_zmax(vp, halfz, zmin, zmax);5333}53345335#if GFX_VER >= 125336void5337genX(invalidate_aux_map_state)(struct iris_batch *batch)5338{5339struct iris_screen *screen = batch->screen;5340void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);5341if (!aux_map_ctx)5342return;5343uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);5344if (batch->last_aux_map_state != aux_map_state_num) {5345/* HSD 1209978178: docs say that before programming the aux table:5346*5347* "Driver must ensure that the engine is IDLE but ensure it doesn't5348* add extra flushes in the case it knows that the engine is already5349* IDLE."5350*5351* An end of pipe sync is needed here, otherwise we see GPU hangs in5352* dEQP-GLES31.functional.copy_image.* tests.5353*/5354iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",5355PIPE_CONTROL_CS_STALL);53565357/* If the aux-map state number increased, then we need to rewrite the5358* register. Rewriting the register is used to both set the aux-map5359* translation table address, and also to invalidate any previously5360* cached translations.5361*/5362iris_load_register_imm32(batch, GENX(GFX_CCS_AUX_INV_num), 1);5363batch->last_aux_map_state = aux_map_state_num;5364}5365}53665367static void5368init_aux_map_state(struct iris_batch *batch)5369{5370struct iris_screen *screen = batch->screen;5371void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);5372if (!aux_map_ctx)5373return;53745375uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);5376assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);5377iris_load_register_imm64(batch, GENX(GFX_AUX_TABLE_BASE_ADDR_num),5378base_addr);5379}5380#endif53815382struct push_bos {5383struct {5384struct iris_address addr;5385uint32_t length;5386} buffers[4];5387int buffer_count;5388uint32_t max_length;5389};53905391static void5392setup_constant_buffers(struct iris_context *ice,5393struct iris_batch *batch,5394int stage,5395struct push_bos *push_bos)5396{5397struct iris_shader_state *shs = &ice->state.shaders[stage];5398struct iris_compiled_shader *shader = ice->shaders.prog[stage];5399struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;54005401uint32_t push_range_sum = 0;54025403int n = 0;5404for (int i = 0; i < 4; i++) {5405const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];54065407if (range->length == 0)5408continue;54095410push_range_sum += range->length;54115412if (range->length > push_bos->max_length)5413push_bos->max_length = range->length;54145415/* Range block is a binding table index, map back to UBO index. */5416unsigned block_index = iris_bti_to_group_index(5417&shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);5418assert(block_index != IRIS_SURFACE_NOT_USED);54195420struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];5421struct iris_resource *res = (void *) cbuf->buffer;54225423assert(cbuf->buffer_offset % 32 == 0);54245425push_bos->buffers[n].length = range->length;5426push_bos->buffers[n].addr =5427res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)5428: batch->screen->workaround_address;5429n++;5430}54315432/* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:5433*5434* "The sum of all four read length fields must be less than or5435* equal to the size of 64."5436*/5437assert(push_range_sum <= 64);54385439push_bos->buffer_count = n;5440}54415442static void5443emit_push_constant_packets(struct iris_context *ice,5444struct iris_batch *batch,5445int stage,5446const struct push_bos *push_bos)5447{5448UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;5449struct iris_compiled_shader *shader = ice->shaders.prog[stage];5450struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;54515452iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {5453pkt._3DCommandSubOpcode = push_constant_opcodes[stage];5454#if GFX_VER >= 125455pkt.MOCS = isl_mocs(isl_dev, 0, false);5456#endif5457if (prog_data) {5458/* The Skylake PRM contains the following restriction:5459*5460* "The driver must ensure The following case does not occur5461* without a flush to the 3D engine: 3DSTATE_CONSTANT_* with5462* buffer 3 read length equal to zero committed followed by a5463* 3DSTATE_CONSTANT_* with buffer 0 read length not equal to5464* zero committed."5465*5466* To avoid this, we program the buffers in the highest slots.5467* This way, slot 0 is only used if slot 3 is also used.5468*/5469int n = push_bos->buffer_count;5470assert(n <= 4);5471const unsigned shift = 4 - n;5472for (int i = 0; i < n; i++) {5473pkt.ConstantBody.ReadLength[i + shift] =5474push_bos->buffers[i].length;5475pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;5476}5477}5478}5479}54805481#if GFX_VER >= 125482static void5483emit_push_constant_packet_all(struct iris_context *ice,5484struct iris_batch *batch,5485uint32_t shader_mask,5486const struct push_bos *push_bos)5487{5488struct isl_device *isl_dev = &batch->screen->isl_dev;54895490if (!push_bos) {5491iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {5492pc.ShaderUpdateEnable = shader_mask;5493}5494return;5495}54965497const uint32_t n = push_bos->buffer_count;5498const uint32_t max_pointers = 4;5499const uint32_t num_dwords = 2 + 2 * n;5500uint32_t const_all[2 + 2 * max_pointers];5501uint32_t *dw = &const_all[0];55025503assert(n <= max_pointers);5504iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {5505all.DWordLength = num_dwords - 2;5506all.MOCS = isl_mocs(isl_dev, 0, false);5507all.ShaderUpdateEnable = shader_mask;5508all.PointerBufferMask = (1 << n) - 1;5509}5510dw += 2;55115512for (int i = 0; i < n; i++) {5513_iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),5514dw + i * 2, data) {5515data.PointerToConstantBuffer = push_bos->buffers[i].addr;5516data.ConstantBufferReadLength = push_bos->buffers[i].length;5517}5518}5519iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);5520}5521#endif55225523static void5524iris_upload_dirty_render_state(struct iris_context *ice,5525struct iris_batch *batch,5526const struct pipe_draw_info *draw)5527{5528const uint64_t dirty = ice->state.dirty;5529const uint64_t stage_dirty = ice->state.stage_dirty;55305531if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&5532!(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))5533return;55345535struct iris_genx_state *genx = ice->state.genx;5536struct iris_binder *binder = &ice->state.binder;5537struct brw_wm_prog_data *wm_prog_data = (void *)5538ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;55395540if (dirty & IRIS_DIRTY_CC_VIEWPORT) {5541const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;5542uint32_t cc_vp_address;55435544/* XXX: could avoid streaming for depth_clip [0,1] case. */5545uint32_t *cc_vp_map =5546stream_state(batch, ice->state.dynamic_uploader,5547&ice->state.last_res.cc_vp,55484 * ice->state.num_viewports *5549GENX(CC_VIEWPORT_length), 32, &cc_vp_address);5550for (int i = 0; i < ice->state.num_viewports; i++) {5551float zmin, zmax;5552iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,5553ice->state.window_space_position,5554&zmin, &zmax);5555if (cso_rast->depth_clip_near)5556zmin = 0.0;5557if (cso_rast->depth_clip_far)5558zmax = 1.0;55595560iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {5561ccv.MinimumDepth = zmin;5562ccv.MaximumDepth = zmax;5563}55645565cc_vp_map += GENX(CC_VIEWPORT_length);5566}55675568iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {5569ptr.CCViewportPointer = cc_vp_address;5570}5571}55725573if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {5574struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;5575uint32_t sf_cl_vp_address;5576uint32_t *vp_map =5577stream_state(batch, ice->state.dynamic_uploader,5578&ice->state.last_res.sf_cl_vp,55794 * ice->state.num_viewports *5580GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);55815582for (unsigned i = 0; i < ice->state.num_viewports; i++) {5583const struct pipe_viewport_state *state = &ice->state.viewports[i];5584float gb_xmin, gb_xmax, gb_ymin, gb_ymax;55855586float vp_xmin = viewport_extent(state, 0, -1.0f);5587float vp_xmax = viewport_extent(state, 0, 1.0f);5588float vp_ymin = viewport_extent(state, 1, -1.0f);5589float vp_ymax = viewport_extent(state, 1, 1.0f);55905591intel_calculate_guardband_size(cso_fb->width, cso_fb->height,5592state->scale[0], state->scale[1],5593state->translate[0], state->translate[1],5594&gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);55955596iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {5597vp.ViewportMatrixElementm00 = state->scale[0];5598vp.ViewportMatrixElementm11 = state->scale[1];5599vp.ViewportMatrixElementm22 = state->scale[2];5600vp.ViewportMatrixElementm30 = state->translate[0];5601vp.ViewportMatrixElementm31 = state->translate[1];5602vp.ViewportMatrixElementm32 = state->translate[2];5603vp.XMinClipGuardband = gb_xmin;5604vp.XMaxClipGuardband = gb_xmax;5605vp.YMinClipGuardband = gb_ymin;5606vp.YMaxClipGuardband = gb_ymax;5607vp.XMinViewPort = MAX2(vp_xmin, 0);5608vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;5609vp.YMinViewPort = MAX2(vp_ymin, 0);5610vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;5611}56125613vp_map += GENX(SF_CLIP_VIEWPORT_length);5614}56155616iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {5617ptr.SFClipViewportPointer = sf_cl_vp_address;5618}5619}56205621if (dirty & IRIS_DIRTY_URB) {5622for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {5623if (!ice->shaders.prog[i]) {5624ice->shaders.urb.size[i] = 1;5625} else {5626struct brw_vue_prog_data *vue_prog_data =5627(void *) ice->shaders.prog[i]->prog_data;5628ice->shaders.urb.size[i] = vue_prog_data->urb_entry_size;5629}5630assert(ice->shaders.urb.size[i] != 0);5631}56325633intel_get_urb_config(&batch->screen->devinfo,5634batch->screen->l3_config_3d,5635ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,5636ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL,5637ice->shaders.urb.size,5638ice->shaders.urb.entries,5639ice->shaders.urb.start,5640&ice->state.urb_deref_block_size,5641&ice->shaders.urb.constrained);56425643for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {5644iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {5645urb._3DCommandSubOpcode += i;5646urb.VSURBStartingAddress = ice->shaders.urb.start[i];5647urb.VSURBEntryAllocationSize = ice->shaders.urb.size[i] - 1;5648urb.VSNumberofURBEntries = ice->shaders.urb.entries[i];5649}5650}5651}56525653if (dirty & IRIS_DIRTY_BLEND_STATE) {5654struct iris_blend_state *cso_blend = ice->state.cso_blend;5655struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;5656struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;5657const int header_dwords = GENX(BLEND_STATE_length);56585659/* Always write at least one BLEND_STATE - the final RT message will5660* reference BLEND_STATE[0] even if there aren't color writes. There5661* may still be alpha testing, computed depth, and so on.5662*/5663const int rt_dwords =5664MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);56655666uint32_t blend_offset;5667uint32_t *blend_map =5668stream_state(batch, ice->state.dynamic_uploader,5669&ice->state.last_res.blend,56704 * (header_dwords + rt_dwords), 64, &blend_offset);56715672uint32_t blend_state_header;5673iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {5674bs.AlphaTestEnable = cso_zsa->alpha_enabled;5675bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);5676}56775678blend_map[0] = blend_state_header | cso_blend->blend_state[0];5679memcpy(&blend_map[1], &cso_blend->blend_state[1], 4 * rt_dwords);56805681iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {5682ptr.BlendStatePointer = blend_offset;5683ptr.BlendStatePointerValid = true;5684}5685}56865687if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {5688struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;5689#if GFX_VER == 85690struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;5691#endif5692uint32_t cc_offset;5693void *cc_map =5694stream_state(batch, ice->state.dynamic_uploader,5695&ice->state.last_res.color_calc,5696sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),569764, &cc_offset);5698iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {5699cc.AlphaTestFormat = ALPHATEST_FLOAT32;5700cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;5701cc.BlendConstantColorRed = ice->state.blend_color.color[0];5702cc.BlendConstantColorGreen = ice->state.blend_color.color[1];5703cc.BlendConstantColorBlue = ice->state.blend_color.color[2];5704cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];5705#if GFX_VER == 85706cc.StencilReferenceValue = p_stencil_refs->ref_value[0];5707cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];5708#endif5709}5710iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {5711ptr.ColorCalcStatePointer = cc_offset;5712ptr.ColorCalcStatePointerValid = true;5713}5714}57155716/* Wa_16040613195717*5718* 3DSTATE_CONSTANT_* needs to be programmed before BTP_*5719*5720* Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if5721* any stage has a dirty binding table.5722*/5723const bool emit_const_wa = GFX_VER >= 11 &&5724((dirty & IRIS_DIRTY_RENDER_BUFFER) ||5725(stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));57265727#if GFX_VER >= 125728uint32_t nobuffer_stages = 0;5729#endif57305731for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5732if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&5733!emit_const_wa)5734continue;57355736struct iris_shader_state *shs = &ice->state.shaders[stage];5737struct iris_compiled_shader *shader = ice->shaders.prog[stage];57385739if (!shader)5740continue;57415742if (shs->sysvals_need_upload)5743upload_sysvals(ice, stage, NULL);57445745struct push_bos push_bos = {};5746setup_constant_buffers(ice, batch, stage, &push_bos);57475748#if GFX_VER >= 125749/* If this stage doesn't have any push constants, emit it later in a5750* single CONSTANT_ALL packet with all the other stages.5751*/5752if (push_bos.buffer_count == 0) {5753nobuffer_stages |= 1 << stage;5754continue;5755}57565757/* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL5758* contains only 5 bits, so we can only use it for buffers smaller than5759* 32.5760*/5761if (push_bos.max_length < 32) {5762emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);5763continue;5764}5765#endif5766emit_push_constant_packets(ice, batch, stage, &push_bos);5767}57685769#if GFX_VER >= 125770if (nobuffer_stages)5771emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);5772#endif57735774for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5775/* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted5776* in order to commit constants. TODO: Investigate "Disable Gather5777* at Set Shader" to go back to legacy mode...5778*/5779if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |5780(GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))5781<< stage)) {5782iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {5783ptr._3DCommandSubOpcode = 38 + stage;5784ptr.PointertoVSBindingTable = binder->bt_offset[stage];5785}5786}5787}57885789if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {5790// XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)5791// XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a657925793/* The PIPE_CONTROL command description says:5794*5795* "Whenever a Binding Table Index (BTI) used by a Render Target5796* Message points to a different RENDER_SURFACE_STATE, SW must issue a5797* Render Target Cache Flush by enabling this bit. When render target5798* flush is set due to new association of BTI, PS Scoreboard Stall bit5799* must be set in this packet."5800*/5801// XXX: does this need to happen at 3DSTATE_BTP_PS time?5802iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",5803PIPE_CONTROL_RENDER_TARGET_FLUSH |5804PIPE_CONTROL_STALL_AT_SCOREBOARD);5805}58065807for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5808if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {5809iris_populate_binding_table(ice, batch, stage, false);5810}5811}58125813for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5814if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||5815!ice->shaders.prog[stage])5816continue;58175818iris_upload_sampler_states(ice, stage);58195820struct iris_shader_state *shs = &ice->state.shaders[stage];5821struct pipe_resource *res = shs->sampler_table.res;5822if (res)5823iris_use_pinned_bo(batch, iris_resource_bo(res), false,5824IRIS_DOMAIN_NONE);58255826iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {5827ptr._3DCommandSubOpcode = 43 + stage;5828ptr.PointertoVSSamplerState = shs->sampler_table.offset;5829}5830}58315832if (ice->state.need_border_colors)5833iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false,5834IRIS_DOMAIN_NONE);58355836if (dirty & IRIS_DIRTY_MULTISAMPLE) {5837iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {5838ms.PixelLocation =5839ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;5840if (ice->state.framebuffer.samples > 0)5841ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;5842}5843}58445845if (dirty & IRIS_DIRTY_SAMPLE_MASK) {5846iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {5847ms.SampleMask = ice->state.sample_mask;5848}5849}58505851for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {5852if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))5853continue;58545855struct iris_compiled_shader *shader = ice->shaders.prog[stage];58565857if (shader) {5858struct brw_stage_prog_data *prog_data = shader->prog_data;5859struct iris_resource *cache = (void *) shader->assembly.res;5860iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);58615862uint32_t scratch_addr =5863pin_scratch_space(ice, batch, prog_data, stage);58645865if (stage == MESA_SHADER_FRAGMENT) {5866UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;5867struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;58685869uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};5870_iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {5871ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;5872ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;5873ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;58745875/* The docs for 3DSTATE_PS::32 Pixel Dispatch Enable say:5876*5877* "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16,5878* SIMD32 Dispatch must not be enabled for PER_PIXEL dispatch5879* mode."5880*5881* 16x MSAA only exists on Gfx9+, so we can skip this on Gfx8.5882*/5883if (GFX_VER >= 9 && cso_fb->samples == 16 &&5884!wm_prog_data->persample_dispatch) {5885assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);5886ps._32PixelDispatchEnable = false;5887}58885889ps.DispatchGRFStartRegisterForConstantSetupData0 =5890brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);5891ps.DispatchGRFStartRegisterForConstantSetupData1 =5892brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);5893ps.DispatchGRFStartRegisterForConstantSetupData2 =5894brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);58955896ps.KernelStartPointer0 = KSP(shader) +5897brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);5898ps.KernelStartPointer1 = KSP(shader) +5899brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);5900ps.KernelStartPointer2 = KSP(shader) +5901brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);59025903#if GFX_VERx10 >= 1255904ps.ScratchSpaceBuffer = scratch_addr >> 4;5905#else5906ps.ScratchSpaceBasePointer =5907rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);5908#endif5909}59105911uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};5912iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {5913#if GFX_VER >= 95914if (!wm_prog_data->uses_sample_mask)5915psx.InputCoverageMaskState = ICMS_NONE;5916else if (wm_prog_data->post_depth_coverage)5917psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;5918else if (wm_prog_data->inner_coverage &&5919cso->conservative_rasterization)5920psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;5921else5922psx.InputCoverageMaskState = ICMS_NORMAL;5923#else5924psx.PixelShaderUsesInputCoverageMask =5925wm_prog_data->uses_sample_mask;5926#endif5927}59285929uint32_t *shader_ps = (uint32_t *) shader->derived_data;5930uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);5931iris_emit_merge(batch, shader_ps, ps_state,5932GENX(3DSTATE_PS_length));5933iris_emit_merge(batch, shader_psx, psx_state,5934GENX(3DSTATE_PS_EXTRA_length));5935} else if (scratch_addr) {5936uint32_t *pkt = (uint32_t *) shader->derived_data;5937switch (stage) {5938case MESA_SHADER_VERTEX: MERGE_SCRATCH_ADDR(3DSTATE_VS); break;5939case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;5940case MESA_SHADER_TESS_EVAL: MERGE_SCRATCH_ADDR(3DSTATE_DS); break;5941case MESA_SHADER_GEOMETRY: MERGE_SCRATCH_ADDR(3DSTATE_GS); break;5942}5943} else {5944iris_batch_emit(batch, shader->derived_data,5945iris_derived_program_state_size(stage));5946}5947} else {5948if (stage == MESA_SHADER_TESS_EVAL) {5949iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);5950iris_emit_cmd(batch, GENX(3DSTATE_TE), te);5951iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);5952} else if (stage == MESA_SHADER_GEOMETRY) {5953iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);5954}5955}5956}59575958if (ice->state.streamout_active) {5959if (dirty & IRIS_DIRTY_SO_BUFFERS) {5960for (int i = 0; i < 4; i++) {5961struct iris_stream_output_target *tgt =5962(void *) ice->state.so_target[i];5963const uint32_t dwords = GENX(3DSTATE_SO_BUFFER_length);5964uint32_t *so_buffers = genx->so_buffers + i * dwords;5965bool zero_offset = false;59665967if (tgt) {5968zero_offset = tgt->zero_offset;5969iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),5970true, IRIS_DOMAIN_OTHER_WRITE);5971iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),5972true, IRIS_DOMAIN_OTHER_WRITE);5973}59745975if (zero_offset) {5976/* Skip the last DWord which contains "Stream Offset" of5977* 0xFFFFFFFF and instead emit a dword of zero directly.5978*/5979STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==598032 * (dwords - 1));5981const uint32_t zero = 0;5982iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));5983iris_batch_emit(batch, &zero, sizeof(zero));5984tgt->zero_offset = false;5985} else {5986iris_batch_emit(batch, so_buffers, 4 * dwords);5987}5988}5989}59905991if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {5992uint32_t *decl_list =5993ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);5994iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));5995}59965997if (dirty & IRIS_DIRTY_STREAMOUT) {5998const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;59996000uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];6001iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {6002sol.SOFunctionEnable = true;6003sol.SOStatisticsEnable = true;60046005sol.RenderingDisable = cso_rast->rasterizer_discard &&6006!ice->state.prims_generated_query_active;6007sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;6008}60096010assert(ice->state.streamout);60116012iris_emit_merge(batch, ice->state.streamout, dynamic_sol,6013GENX(3DSTATE_STREAMOUT_length));6014}6015} else {6016if (dirty & IRIS_DIRTY_STREAMOUT) {6017iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);6018}6019}60206021if (dirty & IRIS_DIRTY_CLIP) {6022struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;6023struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;60246025bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||6026ice->shaders.prog[MESA_SHADER_TESS_EVAL];6027bool points_or_lines = cso_rast->fill_mode_point_or_line ||6028(gs_or_tes ? ice->shaders.output_topology_is_points_or_lines6029: ice->state.prim_is_points_or_lines);60306031uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];6032iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {6033cl.StatisticsEnable = ice->state.statistics_counters_enabled;6034if (cso_rast->rasterizer_discard)6035cl.ClipMode = CLIPMODE_REJECT_ALL;6036else if (ice->state.window_space_position)6037cl.ClipMode = CLIPMODE_ACCEPT_ALL;6038else6039cl.ClipMode = CLIPMODE_NORMAL;60406041cl.PerspectiveDivideDisable = ice->state.window_space_position;6042cl.ViewportXYClipTestEnable = !points_or_lines;60436044if (wm_prog_data->barycentric_interp_modes &6045BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)6046cl.NonPerspectiveBarycentricEnable = true;60476048cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;6049cl.MaximumVPIndex = ice->state.num_viewports - 1;6050}6051iris_emit_merge(batch, cso_rast->clip, dynamic_clip,6052ARRAY_SIZE(cso_rast->clip));6053}60546055if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {6056struct iris_rasterizer_state *cso = ice->state.cso_rast;6057iris_batch_emit(batch, cso->raster, sizeof(cso->raster));60586059uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];6060iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {6061sf.ViewportTransformEnable = !ice->state.window_space_position;60626063#if GFX_VER >= 126064sf.DerefBlockSize = ice->state.urb_deref_block_size;6065#endif6066}6067iris_emit_merge(batch, cso->sf, dynamic_sf,6068ARRAY_SIZE(dynamic_sf));6069}60706071if (dirty & IRIS_DIRTY_WM) {6072struct iris_rasterizer_state *cso = ice->state.cso_rast;6073uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];60746075iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {6076wm.StatisticsEnable = ice->state.statistics_counters_enabled;60776078wm.BarycentricInterpolationMode =6079wm_prog_data->barycentric_interp_modes;60806081if (wm_prog_data->early_fragment_tests)6082wm.EarlyDepthStencilControl = EDSC_PREPS;6083else if (wm_prog_data->has_side_effects)6084wm.EarlyDepthStencilControl = EDSC_PSEXEC;60856086/* We could skip this bit if color writes are enabled. */6087if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)6088wm.ForceThreadDispatchEnable = ForceON;6089}6090iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));6091}60926093if (dirty & IRIS_DIRTY_SBE) {6094iris_emit_sbe(batch, ice);6095}60966097if (dirty & IRIS_DIRTY_PS_BLEND) {6098struct iris_blend_state *cso_blend = ice->state.cso_blend;6099struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;6100const struct shader_info *fs_info =6101iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);61026103uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];6104iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {6105pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);6106pb.AlphaTestEnable = cso_zsa->alpha_enabled;61076108/* The dual source blending docs caution against using SRC1 factors6109* when the shader doesn't use a dual source render target write.6110* Empirically, this can lead to GPU hangs, and the results are6111* undefined anyway, so simply disable blending to avoid the hang.6112*/6113pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&6114(!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);6115}61166117iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,6118ARRAY_SIZE(cso_blend->ps_blend));6119}61206121if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {6122struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;6123#if GFX_VER >= 9 && GFX_VER < 126124struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;6125uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];6126iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {6127wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];6128wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];6129}6130iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));6131#else6132/* Use modify disable fields which allow us to emit packets6133* directly instead of merging them later.6134*/6135iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));6136#endif61376138#if GFX_VER >= 126139iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));6140#endif6141}61426143if (dirty & IRIS_DIRTY_STENCIL_REF) {6144#if GFX_VER >= 126145/* Use modify disable fields which allow us to emit packets6146* directly instead of merging them later.6147*/6148struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;6149uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];6150iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {6151wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];6152wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];6153wmds.StencilTestMaskModifyDisable = true;6154wmds.StencilWriteMaskModifyDisable = true;6155wmds.StencilStateModifyDisable = true;6156wmds.DepthStateModifyDisable = true;6157}6158iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));6159#endif6160}61616162if (dirty & IRIS_DIRTY_SCISSOR_RECT) {6163/* Wa_1409725701:6164* "The viewport-specific state used by the SF unit (SCISSOR_RECT) is6165* stored as an array of up to 16 elements. The location of first6166* element of the array, as specified by Pointer to SCISSOR_RECT,6167* should be aligned to a 64-byte boundary.6168*/6169uint32_t alignment = 64;6170uint32_t scissor_offset =6171emit_state(batch, ice->state.dynamic_uploader,6172&ice->state.last_res.scissor,6173ice->state.scissors,6174sizeof(struct pipe_scissor_state) *6175ice->state.num_viewports, alignment);61766177iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {6178ptr.ScissorRectPointer = scissor_offset;6179}6180}61816182if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {6183struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;61846185/* Do not emit the clear params yets. We need to update the clear value6186* first.6187*/6188uint32_t clear_length = GENX(3DSTATE_CLEAR_PARAMS_length) * 4;6189uint32_t cso_z_size = batch->screen->isl_dev.ds.size - clear_length;;61906191#if GFX_VERx10 == 1206192/* Wa_140104557006193*6194* ISL will change some CHICKEN registers depending on the depth surface6195* format, along with emitting the depth and stencil packets. In that6196* case, we want to do a depth flush and stall, so the pipeline is not6197* using these settings while we change the registers.6198*/6199iris_emit_end_of_pipe_sync(batch,6200"Workaround: Stop pipeline for 14010455700",6201PIPE_CONTROL_DEPTH_STALL |6202PIPE_CONTROL_DEPTH_CACHE_FLUSH);6203#endif62046205iris_batch_emit(batch, cso_z->packets, cso_z_size);6206if (GFX_VER >= 12) {6207/* Wa_14082245816208*6209* Workaround: Gfx12LP Astep only An additional pipe control with6210* post-sync = store dword operation would be required.( w/a is to6211* have an additional pipe control after the stencil state whenever6212* the surface state bits of this state is changing).6213*/6214iris_emit_pipe_control_write(batch, "WA for stencil state",6215PIPE_CONTROL_WRITE_IMMEDIATE,6216batch->screen->workaround_address.bo,6217batch->screen->workaround_address.offset, 0);6218}62196220union isl_color_value clear_value = { .f32 = { 0, } };62216222struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;6223if (cso_fb->zsbuf) {6224struct iris_resource *zres, *sres;6225iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,6226&zres, &sres);6227if (zres && zres->aux.bo)6228clear_value = iris_resource_get_clear_color(zres, NULL, NULL);6229}62306231uint32_t clear_params[GENX(3DSTATE_CLEAR_PARAMS_length)];6232iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {6233clear.DepthClearValueValid = true;6234clear.DepthClearValue = clear_value.f32[0];6235}6236iris_batch_emit(batch, clear_params, clear_length);6237}62386239if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {6240/* Listen for buffer changes, and also write enable changes. */6241struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;6242pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);6243}62446245if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {6246iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {6247for (int i = 0; i < 32; i++) {6248poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];6249}6250}6251}62526253if (dirty & IRIS_DIRTY_LINE_STIPPLE) {6254struct iris_rasterizer_state *cso = ice->state.cso_rast;6255iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));6256}62576258if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {6259iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {6260topo.PrimitiveTopologyType =6261translate_prim_type(draw->mode, draw->vertices_per_patch);6262}6263}62646265if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {6266int count = util_bitcount64(ice->state.bound_vertex_buffers);6267uint64_t dynamic_bound = ice->state.bound_vertex_buffers;62686269if (ice->state.vs_uses_draw_params) {6270assert(ice->draw.draw_params.res);62716272struct iris_vertex_buffer_state *state =6273&(ice->state.genx->vertex_buffers[count]);6274pipe_resource_reference(&state->resource, ice->draw.draw_params.res);6275struct iris_resource *res = (void *) state->resource;62766277iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {6278vb.VertexBufferIndex = count;6279vb.AddressModifyEnable = true;6280vb.BufferPitch = 0;6281vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;6282vb.BufferStartingAddress =6283ro_bo(NULL, res->bo->gtt_offset +6284(int) ice->draw.draw_params.offset);6285vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev,6286ISL_SURF_USAGE_VERTEX_BUFFER_BIT);6287#if GFX_VER >= 126288vb.L3BypassDisable = true;6289#endif6290}6291dynamic_bound |= 1ull << count;6292count++;6293}62946295if (ice->state.vs_uses_derived_draw_params) {6296struct iris_vertex_buffer_state *state =6297&(ice->state.genx->vertex_buffers[count]);6298pipe_resource_reference(&state->resource,6299ice->draw.derived_draw_params.res);6300struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;63016302iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {6303vb.VertexBufferIndex = count;6304vb.AddressModifyEnable = true;6305vb.BufferPitch = 0;6306vb.BufferSize =6307res->bo->size - ice->draw.derived_draw_params.offset;6308vb.BufferStartingAddress =6309ro_bo(NULL, res->bo->gtt_offset +6310(int) ice->draw.derived_draw_params.offset);6311vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev,6312ISL_SURF_USAGE_VERTEX_BUFFER_BIT);6313#if GFX_VER >= 126314vb.L3BypassDisable = true;6315#endif6316}6317dynamic_bound |= 1ull << count;6318count++;6319}63206321if (count) {6322#if GFX_VER >= 116323/* Gfx11+ doesn't need the cache workaround below */6324uint64_t bound = dynamic_bound;6325while (bound) {6326const int i = u_bit_scan64(&bound);6327iris_use_optional_res(batch, genx->vertex_buffers[i].resource,6328false, IRIS_DOMAIN_OTHER_READ);6329}6330#else6331/* The VF cache designers cut corners, and made the cache key's6332* <VertexBufferIndex, Memory Address> tuple only consider the bottom6333* 32 bits of the address. If you have two vertex buffers which get6334* placed exactly 4 GiB apart and use them in back-to-back draw calls,6335* you can get collisions (even within a single batch).6336*6337* So, we need to do a VF cache invalidate if the buffer for a VB6338* slot slot changes [48:32] address bits from the previous time.6339*/6340unsigned flush_flags = 0;63416342uint64_t bound = dynamic_bound;6343while (bound) {6344const int i = u_bit_scan64(&bound);6345uint16_t high_bits = 0;63466347struct iris_resource *res =6348(void *) genx->vertex_buffers[i].resource;6349if (res) {6350iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);63516352high_bits = res->bo->gtt_offset >> 32ull;6353if (high_bits != ice->state.last_vbo_high_bits[i]) {6354flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |6355PIPE_CONTROL_CS_STALL;6356ice->state.last_vbo_high_bits[i] = high_bits;6357}6358}6359}63606361if (flush_flags) {6362iris_emit_pipe_control_flush(batch,6363"workaround: VF cache 32-bit key [VB]",6364flush_flags);6365}6366#endif63676368const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);63696370uint32_t *map =6371iris_get_command_space(batch, 4 * (1 + vb_dwords * count));6372_iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {6373vb.DWordLength = (vb_dwords * count + 1) - 2;6374}6375map += 1;63766377bound = dynamic_bound;6378while (bound) {6379const int i = u_bit_scan64(&bound);6380memcpy(map, genx->vertex_buffers[i].state,6381sizeof(uint32_t) * vb_dwords);6382map += vb_dwords;6383}6384}6385}63866387if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {6388struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;6389const unsigned entries = MAX2(cso->count, 1);6390if (!(ice->state.vs_needs_sgvs_element ||6391ice->state.vs_uses_derived_draw_params ||6392ice->state.vs_needs_edge_flag)) {6393iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *6394(1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));6395} else {6396uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];6397const unsigned dyn_count = cso->count +6398ice->state.vs_needs_sgvs_element +6399ice->state.vs_uses_derived_draw_params;64006401iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),6402&dynamic_ves, ve) {6403ve.DWordLength =64041 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;6405}6406memcpy(&dynamic_ves[1], &cso->vertex_elements[1],6407(cso->count - ice->state.vs_needs_edge_flag) *6408GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));6409uint32_t *ve_pack_dest =6410&dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *6411GENX(VERTEX_ELEMENT_STATE_length)];64126413if (ice->state.vs_needs_sgvs_element) {6414uint32_t base_ctrl = ice->state.vs_uses_draw_params ?6415VFCOMP_STORE_SRC : VFCOMP_STORE_0;6416iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {6417ve.Valid = true;6418ve.VertexBufferIndex =6419util_bitcount64(ice->state.bound_vertex_buffers);6420ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;6421ve.Component0Control = base_ctrl;6422ve.Component1Control = base_ctrl;6423ve.Component2Control = VFCOMP_STORE_0;6424ve.Component3Control = VFCOMP_STORE_0;6425}6426ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);6427}6428if (ice->state.vs_uses_derived_draw_params) {6429iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {6430ve.Valid = true;6431ve.VertexBufferIndex =6432util_bitcount64(ice->state.bound_vertex_buffers) +6433ice->state.vs_uses_draw_params;6434ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;6435ve.Component0Control = VFCOMP_STORE_SRC;6436ve.Component1Control = VFCOMP_STORE_SRC;6437ve.Component2Control = VFCOMP_STORE_0;6438ve.Component3Control = VFCOMP_STORE_0;6439}6440ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);6441}6442if (ice->state.vs_needs_edge_flag) {6443for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)6444ve_pack_dest[i] = cso->edgeflag_ve[i];6445}64466447iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *6448(1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));6449}64506451if (!ice->state.vs_needs_edge_flag) {6452iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *6453entries * GENX(3DSTATE_VF_INSTANCING_length));6454} else {6455assert(cso->count > 0);6456const unsigned edgeflag_index = cso->count - 1;6457uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];6458memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *6459GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));64606461uint32_t *vfi_pack_dest = &dynamic_vfi[0] +6462edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);6463iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {6464vi.VertexElementIndex = edgeflag_index +6465ice->state.vs_needs_sgvs_element +6466ice->state.vs_uses_derived_draw_params;6467}6468for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)6469vfi_pack_dest[i] |= cso->edgeflag_vfi[i];64706471iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *6472entries * GENX(3DSTATE_VF_INSTANCING_length));6473}6474}64756476if (dirty & IRIS_DIRTY_VF_SGVS) {6477const struct brw_vs_prog_data *vs_prog_data = (void *)6478ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;6479struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;64806481iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {6482if (vs_prog_data->uses_vertexid) {6483sgv.VertexIDEnable = true;6484sgv.VertexIDComponentNumber = 2;6485sgv.VertexIDElementOffset =6486cso->count - ice->state.vs_needs_edge_flag;6487}64886489if (vs_prog_data->uses_instanceid) {6490sgv.InstanceIDEnable = true;6491sgv.InstanceIDComponentNumber = 3;6492sgv.InstanceIDElementOffset =6493cso->count - ice->state.vs_needs_edge_flag;6494}6495}6496}64976498if (dirty & IRIS_DIRTY_VF) {6499iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {6500if (draw->primitive_restart) {6501vf.IndexedDrawCutIndexEnable = true;6502vf.CutIndex = draw->restart_index;6503}6504}6505}65066507if (dirty & IRIS_DIRTY_VF_STATISTICS) {6508iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {6509vf.StatisticsEnable = true;6510}6511}65126513#if GFX_VER == 86514if (dirty & IRIS_DIRTY_PMA_FIX) {6515bool enable = want_pma_fix(ice);6516genX(update_pma_fix)(ice, batch, enable);6517}6518#endif65196520if (ice->state.current_hash_scale != 1)6521genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);65226523#if GFX_VER >= 126524genX(invalidate_aux_map_state)(batch);6525#endif6526}65276528static void6529iris_upload_render_state(struct iris_context *ice,6530struct iris_batch *batch,6531const struct pipe_draw_info *draw,6532unsigned drawid_offset,6533const struct pipe_draw_indirect_info *indirect,6534const struct pipe_draw_start_count_bias *sc)6535{6536bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;65376538iris_batch_sync_region_start(batch);65396540/* Always pin the binder. If we're emitting new binding table pointers,6541* we need it. If not, we're probably inheriting old tables via the6542* context, and need it anyway. Since true zero-bindings cases are6543* practically non-existent, just pin it and avoid last_res tracking.6544*/6545iris_use_pinned_bo(batch, ice->state.binder.bo, false,6546IRIS_DOMAIN_NONE);65476548if (!batch->contains_draw) {6549if (GFX_VER == 12) {6550/* Re-emit constants when starting a new batch buffer in order to6551* work around push constant corruption on context switch.6552*6553* XXX - Provide hardware spec quotation when available.6554*/6555ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |6556IRIS_STAGE_DIRTY_CONSTANTS_TCS |6557IRIS_STAGE_DIRTY_CONSTANTS_TES |6558IRIS_STAGE_DIRTY_CONSTANTS_GS |6559IRIS_STAGE_DIRTY_CONSTANTS_FS);6560}6561batch->contains_draw = true;6562}65636564if (!batch->contains_draw_with_next_seqno) {6565iris_restore_render_saved_bos(ice, batch, draw);6566batch->contains_draw_with_next_seqno = true;6567}65686569iris_upload_dirty_render_state(ice, batch, draw);65706571if (draw->index_size > 0) {6572unsigned offset;65736574if (draw->has_user_indices) {6575unsigned start_offset = draw->index_size * sc->start;65766577u_upload_data(ice->ctx.const_uploader, start_offset,6578sc->count * draw->index_size, 4,6579(char*)draw->index.user + start_offset,6580&offset, &ice->state.last_res.index_buffer);6581offset -= start_offset;6582} else {6583struct iris_resource *res = (void *) draw->index.resource;6584res->bind_history |= PIPE_BIND_INDEX_BUFFER;65856586pipe_resource_reference(&ice->state.last_res.index_buffer,6587draw->index.resource);6588offset = 0;6589}65906591struct iris_genx_state *genx = ice->state.genx;6592struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);65936594uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];6595iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {6596ib.IndexFormat = draw->index_size >> 1;6597ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,6598ISL_SURF_USAGE_INDEX_BUFFER_BIT);6599ib.BufferSize = bo->size - offset;6600ib.BufferStartingAddress = ro_bo(NULL, bo->gtt_offset + offset);6601#if GFX_VER >= 126602ib.L3BypassDisable = true;6603#endif6604}66056606if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {6607memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));6608iris_batch_emit(batch, ib_packet, sizeof(ib_packet));6609iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_OTHER_READ);6610}66116612#if GFX_VER < 116613/* The VF cache key only uses 32-bits, see vertex buffer comment above */6614uint16_t high_bits = bo->gtt_offset >> 32ull;6615if (high_bits != ice->state.last_index_bo_high_bits) {6616iris_emit_pipe_control_flush(batch,6617"workaround: VF cache 32-bit key [IB]",6618PIPE_CONTROL_VF_CACHE_INVALIDATE |6619PIPE_CONTROL_CS_STALL);6620ice->state.last_index_bo_high_bits = high_bits;6621}6622#endif6623}66246625#define _3DPRIM_END_OFFSET 0x24206626#define _3DPRIM_START_VERTEX 0x24306627#define _3DPRIM_VERTEX_COUNT 0x24346628#define _3DPRIM_INSTANCE_COUNT 0x24386629#define _3DPRIM_START_INSTANCE 0x243C6630#define _3DPRIM_BASE_VERTEX 0x244066316632if (indirect && !indirect->count_from_stream_output) {6633if (indirect->indirect_draw_count) {6634use_predicate = true;66356636struct iris_bo *draw_count_bo =6637iris_resource_bo(indirect->indirect_draw_count);6638unsigned draw_count_offset =6639indirect->indirect_draw_count_offset;66406641iris_emit_pipe_control_flush(batch,6642"ensure indirect draw buffer is flushed",6643PIPE_CONTROL_FLUSH_ENABLE);66446645if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {6646struct mi_builder b;6647mi_builder_init(&b, &batch->screen->devinfo, batch);66486649/* comparison = draw id < draw count */6650struct mi_value comparison =6651mi_ult(&b, mi_imm(drawid_offset),6652mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));66536654/* predicate = comparison & conditional rendering predicate */6655mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),6656mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));6657} else {6658uint32_t mi_predicate;66596660/* Upload the id of the current primitive to MI_PREDICATE_SRC1. */6661iris_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);6662/* Upload the current draw count from the draw parameters buffer6663* to MI_PREDICATE_SRC0.6664*/6665iris_load_register_mem32(batch, MI_PREDICATE_SRC0,6666draw_count_bo, draw_count_offset);6667/* Zero the top 32-bits of MI_PREDICATE_SRC0 */6668iris_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);66696670if (drawid_offset == 0) {6671mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |6672MI_PREDICATE_COMBINEOP_SET |6673MI_PREDICATE_COMPAREOP_SRCS_EQUAL;6674} else {6675/* While draw_index < draw_count the predicate's result will be6676* (draw_index == draw_count) ^ TRUE = TRUE6677* When draw_index == draw_count the result is6678* (TRUE) ^ TRUE = FALSE6679* After this all results will be:6680* (FALSE) ^ FALSE = FALSE6681*/6682mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |6683MI_PREDICATE_COMBINEOP_XOR |6684MI_PREDICATE_COMPAREOP_SRCS_EQUAL;6685}6686iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));6687}6688}6689struct iris_bo *bo = iris_resource_bo(indirect->buffer);6690assert(bo);66916692iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6693lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;6694lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);6695}6696iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6697lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;6698lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);6699}6700iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6701lrm.RegisterAddress = _3DPRIM_START_VERTEX;6702lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);6703}6704if (draw->index_size) {6705iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6706lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;6707lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);6708}6709iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6710lrm.RegisterAddress = _3DPRIM_START_INSTANCE;6711lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);6712}6713} else {6714iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6715lrm.RegisterAddress = _3DPRIM_START_INSTANCE;6716lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);6717}6718iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {6719lri.RegisterOffset = _3DPRIM_BASE_VERTEX;6720lri.DataDWord = 0;6721}6722}6723} else if (indirect && indirect->count_from_stream_output) {6724struct iris_stream_output_target *so =6725(void *) indirect->count_from_stream_output;67266727/* XXX: Replace with actual cache tracking */6728iris_emit_pipe_control_flush(batch,6729"draw count from stream output stall",6730PIPE_CONTROL_CS_STALL);67316732struct mi_builder b;6733mi_builder_init(&b, &batch->screen->devinfo, batch);67346735struct iris_address addr =6736ro_bo(iris_resource_bo(so->offset.res), so->offset.offset);6737struct mi_value offset =6738mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);67396740mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),6741mi_udiv32_imm(&b, offset, so->stride));67426743_iris_emit_lri(batch, _3DPRIM_START_VERTEX, 0);6744_iris_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);6745_iris_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);6746_iris_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);6747}67486749iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);67506751iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {6752prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;6753prim.PredicateEnable = use_predicate;67546755if (indirect) {6756prim.IndirectParameterEnable = true;6757} else {6758prim.StartInstanceLocation = draw->start_instance;6759prim.InstanceCount = draw->instance_count;6760prim.VertexCountPerInstance = sc->count;67616762prim.StartVertexLocation = sc->start;67636764if (draw->index_size) {6765prim.BaseVertexLocation += sc->index_bias;6766}6767}6768}67696770iris_batch_sync_region_end(batch);6771}67726773static void6774iris_load_indirect_location(struct iris_context *ice,6775struct iris_batch *batch,6776const struct pipe_grid_info *grid)6777{6778#define GPGPU_DISPATCHDIMX 0x25006779#define GPGPU_DISPATCHDIMY 0x25046780#define GPGPU_DISPATCHDIMZ 0x250867816782assert(grid->indirect);67836784struct iris_state_ref *grid_size = &ice->state.grid_size;6785struct iris_bo *bo = iris_resource_bo(grid_size->res);6786iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6787lrm.RegisterAddress = GPGPU_DISPATCHDIMX;6788lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);6789}6790iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6791lrm.RegisterAddress = GPGPU_DISPATCHDIMY;6792lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);6793}6794iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {6795lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;6796lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);6797}6798}67996800#if GFX_VERx10 >= 12568016802static void6803iris_upload_compute_walker(struct iris_context *ice,6804struct iris_batch *batch,6805const struct pipe_grid_info *grid)6806{6807const uint64_t stage_dirty = ice->state.stage_dirty;6808struct iris_screen *screen = batch->screen;6809const struct intel_device_info *devinfo = &screen->devinfo;6810struct iris_binder *binder = &ice->state.binder;6811struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];6812struct iris_compiled_shader *shader =6813ice->shaders.prog[MESA_SHADER_COMPUTE];6814struct brw_stage_prog_data *prog_data = shader->prog_data;6815struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;6816const struct brw_cs_dispatch_info dispatch =6817brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);68186819if (stage_dirty & IRIS_STAGE_DIRTY_CS) {6820iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {6821cfe.MaximumNumberofThreads =6822devinfo->max_cs_threads * screen->subslice_total - 1;6823if (prog_data->total_scratch > 0) {6824cfe.ScratchSpaceBuffer =6825iris_get_scratch_surf(ice, prog_data->total_scratch)->offset >> 4;6826}6827}6828}68296830if (grid->indirect)6831iris_load_indirect_location(ice, batch, grid);68326833iris_emit_cmd(batch, GENX(COMPUTE_WALKER), cw) {6834cw.IndirectParameterEnable = grid->indirect;6835cw.SIMDSize = dispatch.simd_size / 16;6836cw.LocalXMaximum = grid->block[0] - 1;6837cw.LocalYMaximum = grid->block[1] - 1;6838cw.LocalZMaximum = grid->block[2] - 1;6839cw.ThreadGroupIDXDimension = grid->grid[0];6840cw.ThreadGroupIDYDimension = grid->grid[1];6841cw.ThreadGroupIDZDimension = grid->grid[2];6842cw.ExecutionMask = dispatch.right_mask;68436844cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {6845.KernelStartPointer = KSP(shader),6846.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,6847.SharedLocalMemorySize =6848encode_slm_size(GFX_VER, prog_data->total_shared),6849.BarrierEnable = cs_prog_data->uses_barrier,6850.SamplerStatePointer = shs->sampler_table.offset,6851.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE],6852};68536854assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0);6855}68566857}68586859#else /* #if GFX_VERx10 >= 125 */68606861static void6862iris_upload_gpgpu_walker(struct iris_context *ice,6863struct iris_batch *batch,6864const struct pipe_grid_info *grid)6865{6866const uint64_t stage_dirty = ice->state.stage_dirty;6867struct iris_screen *screen = batch->screen;6868const struct intel_device_info *devinfo = &screen->devinfo;6869struct iris_binder *binder = &ice->state.binder;6870struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];6871struct iris_uncompiled_shader *ish =6872ice->shaders.uncompiled[MESA_SHADER_COMPUTE];6873struct iris_compiled_shader *shader =6874ice->shaders.prog[MESA_SHADER_COMPUTE];6875struct brw_stage_prog_data *prog_data = shader->prog_data;6876struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;6877const struct brw_cs_dispatch_info dispatch =6878brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);68796880if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||6881cs_prog_data->local_size[0] == 0 /* Variable local group size */) {6882/* The MEDIA_VFE_STATE documentation for Gfx8+ says:6883*6884* "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless6885* the only bits that are changed are scoreboard related: Scoreboard6886* Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For6887* these scoreboard related states, a MEDIA_STATE_FLUSH is6888* sufficient."6889*/6890iris_emit_pipe_control_flush(batch,6891"workaround: stall before MEDIA_VFE_STATE",6892PIPE_CONTROL_CS_STALL);68936894iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {6895if (prog_data->total_scratch) {6896uint32_t scratch_addr =6897pin_scratch_space(ice, batch, prog_data, MESA_SHADER_COMPUTE);68986899vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;6900vfe.ScratchSpaceBasePointer =6901rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);6902}69036904vfe.MaximumNumberofThreads =6905devinfo->max_cs_threads * screen->subslice_total - 1;6906#if GFX_VER < 116907vfe.ResetGatewayTimer =6908Resettingrelativetimerandlatchingtheglobaltimestamp;6909#endif6910#if GFX_VER == 86911vfe.BypassGatewayControl = true;6912#endif6913vfe.NumberofURBEntries = 2;6914vfe.URBEntryAllocationSize = 2;69156916vfe.CURBEAllocationSize =6917ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +6918cs_prog_data->push.cross_thread.regs, 2);6919}6920}69216922/* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */6923if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||6924cs_prog_data->local_size[0] == 0 /* Variable local group size */) {6925uint32_t curbe_data_offset = 0;6926assert(cs_prog_data->push.cross_thread.dwords == 0 &&6927cs_prog_data->push.per_thread.dwords == 1 &&6928cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);6929const unsigned push_const_size =6930brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);6931uint32_t *curbe_data_map =6932stream_state(batch, ice->state.dynamic_uploader,6933&ice->state.last_res.cs_thread_ids,6934ALIGN(push_const_size, 64), 64,6935&curbe_data_offset);6936assert(curbe_data_map);6937memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));6938iris_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,6939curbe_data_map);69406941iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {6942curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);6943curbe.CURBEDataStartAddress = curbe_data_offset;6944}6945}69466947for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {6948struct pipe_resource *res = ice->state.global_bindings[i];6949if (!res)6950continue;69516952iris_use_pinned_bo(batch, iris_resource_bo(res),6953true, IRIS_DOMAIN_NONE);6954}69556956if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |6957IRIS_STAGE_DIRTY_BINDINGS_CS |6958IRIS_STAGE_DIRTY_CONSTANTS_CS |6959IRIS_STAGE_DIRTY_CS)) {6960uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];69616962iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {6963idd.SharedLocalMemorySize =6964encode_slm_size(GFX_VER, ish->kernel_shared_size);6965idd.KernelStartPointer =6966KSP(shader) + brw_cs_prog_data_prog_offset(cs_prog_data,6967dispatch.simd_size);6968idd.SamplerStatePointer = shs->sampler_table.offset;6969idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];6970idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;6971}69726973for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)6974desc[i] |= ((uint32_t *) shader->derived_data)[i];69756976iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {6977load.InterfaceDescriptorTotalLength =6978GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);6979load.InterfaceDescriptorDataStartAddress =6980emit_state(batch, ice->state.dynamic_uploader,6981&ice->state.last_res.cs_desc, desc, sizeof(desc), 64);6982}6983}69846985if (grid->indirect)6986iris_load_indirect_location(ice, batch, grid);69876988iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);69896990iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {6991ggw.IndirectParameterEnable = grid->indirect != NULL;6992ggw.SIMDSize = dispatch.simd_size / 16;6993ggw.ThreadDepthCounterMaximum = 0;6994ggw.ThreadHeightCounterMaximum = 0;6995ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;6996ggw.ThreadGroupIDXDimension = grid->grid[0];6997ggw.ThreadGroupIDYDimension = grid->grid[1];6998ggw.ThreadGroupIDZDimension = grid->grid[2];6999ggw.RightExecutionMask = dispatch.right_mask;7000ggw.BottomExecutionMask = 0xffffffff;7001}70027003iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);7004}70057006#endif /* #if GFX_VERx10 >= 125 */70077008static void7009iris_upload_compute_state(struct iris_context *ice,7010struct iris_batch *batch,7011const struct pipe_grid_info *grid)7012{7013const uint64_t stage_dirty = ice->state.stage_dirty;7014struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];7015struct iris_compiled_shader *shader =7016ice->shaders.prog[MESA_SHADER_COMPUTE];70177018iris_batch_sync_region_start(batch);70197020/* Always pin the binder. If we're emitting new binding table pointers,7021* we need it. If not, we're probably inheriting old tables via the7022* context, and need it anyway. Since true zero-bindings cases are7023* practically non-existent, just pin it and avoid last_res tracking.7024*/7025iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);70267027if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&7028shs->sysvals_need_upload) ||7029shader->kernel_input_size > 0)7030upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);70317032if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)7033iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);70347035if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)7036iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);70377038iris_use_optional_res(batch, shs->sampler_table.res, false,7039IRIS_DOMAIN_NONE);7040iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,7041IRIS_DOMAIN_NONE);70427043if (ice->state.need_border_colors)7044iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false,7045IRIS_DOMAIN_NONE);70467047#if GFX_VER >= 127048genX(invalidate_aux_map_state)(batch);7049#endif70507051#if GFX_VERx10 >= 1257052iris_upload_compute_walker(ice, batch, grid);7053#else7054iris_upload_gpgpu_walker(ice, batch, grid);7055#endif70567057if (!batch->contains_draw_with_next_seqno) {7058iris_restore_compute_saved_bos(ice, batch, grid);7059batch->contains_draw_with_next_seqno = batch->contains_draw = true;7060}70617062iris_batch_sync_region_end(batch);7063}70647065/**7066* State module teardown.7067*/7068static void7069iris_destroy_state(struct iris_context *ice)7070{7071struct iris_genx_state *genx = ice->state.genx;70727073pipe_resource_reference(&ice->draw.draw_params.res, NULL);7074pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);70757076/* Loop over all VBOs, including ones for draw parameters */7077for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {7078pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);7079}70807081free(ice->state.genx);70827083for (int i = 0; i < 4; i++) {7084pipe_so_target_reference(&ice->state.so_target[i], NULL);7085}70867087for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {7088pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);7089}7090pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);70917092for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {7093struct iris_shader_state *shs = &ice->state.shaders[stage];7094pipe_resource_reference(&shs->sampler_table.res, NULL);7095for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {7096pipe_resource_reference(&shs->constbuf[i].buffer, NULL);7097pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);7098}7099for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {7100pipe_resource_reference(&shs->image[i].base.resource, NULL);7101pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);7102free(shs->image[i].surface_state.cpu);7103}7104for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {7105pipe_resource_reference(&shs->ssbo[i].buffer, NULL);7106pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);7107}7108for (int i = 0; i < IRIS_MAX_TEXTURE_SAMPLERS; i++) {7109pipe_sampler_view_reference((struct pipe_sampler_view **)7110&shs->textures[i], NULL);7111}7112}71137114pipe_resource_reference(&ice->state.grid_size.res, NULL);7115pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);71167117pipe_resource_reference(&ice->state.null_fb.res, NULL);7118pipe_resource_reference(&ice->state.unbound_tex.res, NULL);71197120pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);7121pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);7122pipe_resource_reference(&ice->state.last_res.color_calc, NULL);7123pipe_resource_reference(&ice->state.last_res.scissor, NULL);7124pipe_resource_reference(&ice->state.last_res.blend, NULL);7125pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);7126pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);7127pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);7128}71297130/* ------------------------------------------------------------------- */71317132static void7133iris_rebind_buffer(struct iris_context *ice,7134struct iris_resource *res)7135{7136struct pipe_context *ctx = &ice->ctx;7137struct iris_genx_state *genx = ice->state.genx;71387139assert(res->base.b.target == PIPE_BUFFER);71407141/* Buffers can't be framebuffer attachments, nor display related,7142* and we don't have upstream Clover support.7143*/7144assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |7145PIPE_BIND_RENDER_TARGET |7146PIPE_BIND_BLENDABLE |7147PIPE_BIND_DISPLAY_TARGET |7148PIPE_BIND_CURSOR |7149PIPE_BIND_COMPUTE_RESOURCE |7150PIPE_BIND_GLOBAL)));71517152if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {7153uint64_t bound_vbs = ice->state.bound_vertex_buffers;7154while (bound_vbs) {7155const int i = u_bit_scan64(&bound_vbs);7156struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];71577158/* Update the CPU struct */7159STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);7160STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);7161uint64_t *addr = (uint64_t *) &state->state[1];7162struct iris_bo *bo = iris_resource_bo(state->resource);71637164if (*addr != bo->gtt_offset + state->offset) {7165*addr = bo->gtt_offset + state->offset;7166ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;7167}7168}7169}71707171/* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit7172* the 3DSTATE_INDEX_BUFFER packet whenever the address changes.7173*7174* There is also no need to handle these:7175* - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)7176* - PIPE_BIND_QUERY_BUFFER (no persistent state references)7177*/71787179if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {7180uint32_t *so_buffers = genx->so_buffers;7181for (unsigned i = 0; i < 4; i++,7182so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {71837184/* There are no other fields in bits 127:64 */7185uint64_t *addr = (uint64_t *) &so_buffers[2];7186STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);7187STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);71887189struct pipe_stream_output_target *tgt = ice->state.so_target[i];7190if (tgt) {7191struct iris_bo *bo = iris_resource_bo(tgt->buffer);7192if (*addr != bo->gtt_offset + tgt->buffer_offset) {7193*addr = bo->gtt_offset + tgt->buffer_offset;7194ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;7195}7196}7197}7198}71997200for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {7201struct iris_shader_state *shs = &ice->state.shaders[s];7202enum pipe_shader_type p_stage = stage_to_pipe(s);72037204if (!(res->bind_stages & (1 << s)))7205continue;72067207if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {7208/* Skip constant buffer 0, it's for regular uniforms, not UBOs */7209uint32_t bound_cbufs = shs->bound_cbufs & ~1u;7210while (bound_cbufs) {7211const int i = u_bit_scan(&bound_cbufs);7212struct pipe_shader_buffer *cbuf = &shs->constbuf[i];7213struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];72147215if (res->bo == iris_resource_bo(cbuf->buffer)) {7216pipe_resource_reference(&surf_state->res, NULL);7217ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;7218}7219}7220}72217222if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {7223uint32_t bound_ssbos = shs->bound_ssbos;7224while (bound_ssbos) {7225const int i = u_bit_scan(&bound_ssbos);7226struct pipe_shader_buffer *ssbo = &shs->ssbo[i];72277228if (res->bo == iris_resource_bo(ssbo->buffer)) {7229struct pipe_shader_buffer buf = {7230.buffer = &res->base.b,7231.buffer_offset = ssbo->buffer_offset,7232.buffer_size = ssbo->buffer_size,7233};7234iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,7235(shs->writable_ssbos >> i) & 1);7236}7237}7238}72397240if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {7241uint32_t bound_sampler_views = shs->bound_sampler_views;7242while (bound_sampler_views) {7243const int i = u_bit_scan(&bound_sampler_views);7244struct iris_sampler_view *isv = shs->textures[i];7245struct iris_bo *bo = isv->res->bo;72467247if (update_surface_state_addrs(ice->state.surface_uploader,7248&isv->surface_state, bo)) {7249ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;7250}7251}7252}72537254if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {7255uint32_t bound_image_views = shs->bound_image_views;7256while (bound_image_views) {7257const int i = u_bit_scan(&bound_image_views);7258struct iris_image_view *iv = &shs->image[i];7259struct iris_bo *bo = iris_resource_bo(iv->base.resource);72607261if (update_surface_state_addrs(ice->state.surface_uploader,7262&iv->surface_state, bo)) {7263ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;7264}7265}7266}7267}7268}72697270/* ------------------------------------------------------------------- */72717272/**7273* Introduce a batch synchronization boundary, and update its cache coherency7274* status to reflect the execution of a PIPE_CONTROL command with the7275* specified flags.7276*/7277static void7278batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)7279{7280iris_batch_sync_boundary(batch);72817282if ((flags & PIPE_CONTROL_CS_STALL)) {7283if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))7284iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);72857286if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))7287iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);72887289if ((flags & PIPE_CONTROL_FLUSH_ENABLE))7290iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);72917292if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |7293PIPE_CONTROL_STALL_AT_SCOREBOARD)))7294iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);7295}72967297if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))7298iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);72997300if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))7301iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);73027303if ((flags & PIPE_CONTROL_FLUSH_ENABLE))7304iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);73057306if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) &&7307(flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))7308iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_READ);7309}73107311static unsigned7312flags_to_post_sync_op(uint32_t flags)7313{7314if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)7315return WriteImmediateData;73167317if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)7318return WritePSDepthCount;73197320if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)7321return WriteTimestamp;73227323return 0;7324}73257326/**7327* Do the given flags have a Post Sync or LRI Post Sync operation?7328*/7329static enum pipe_control_flags7330get_post_sync_flags(enum pipe_control_flags flags)7331{7332flags &= PIPE_CONTROL_WRITE_IMMEDIATE |7333PIPE_CONTROL_WRITE_DEPTH_COUNT |7334PIPE_CONTROL_WRITE_TIMESTAMP |7335PIPE_CONTROL_LRI_POST_SYNC_OP;73367337/* Only one "Post Sync Op" is allowed, and it's mutually exclusive with7338* "LRI Post Sync Operation". So more than one bit set would be illegal.7339*/7340assert(util_bitcount(flags) <= 1);73417342return flags;7343}73447345#define IS_COMPUTE_PIPELINE(batch) (batch->name == IRIS_BATCH_COMPUTE)73467347/**7348* Emit a series of PIPE_CONTROL commands, taking into account any7349* workarounds necessary to actually accomplish the caller's request.7350*7351* Unless otherwise noted, spec quotations in this function come from:7352*7353* Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming7354* Restrictions for PIPE_CONTROL.7355*7356* You should not use this function directly. Use the helpers in7357* iris_pipe_control.c instead, which may split the pipe control further.7358*/7359static void7360iris_emit_raw_pipe_control(struct iris_batch *batch,7361const char *reason,7362uint32_t flags,7363struct iris_bo *bo,7364uint32_t offset,7365uint64_t imm)7366{7367UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;7368enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);7369enum pipe_control_flags non_lri_post_sync_flags =7370post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;73717372/* Recursive PIPE_CONTROL workarounds --------------------------------7373* (http://knowyourmeme.com/memes/xzibit-yo-dawg)7374*7375* We do these first because we want to look at the original operation,7376* rather than any workarounds we set.7377*/7378if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {7379/* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description7380* lists several workarounds:7381*7382* "Project: SKL, KBL, BXT7383*7384* If the VF Cache Invalidation Enable is set to a 1 in a7385* PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields7386* sets to 0, with the VF Cache Invalidation Enable set to 07387* needs to be sent prior to the PIPE_CONTROL with VF Cache7388* Invalidation Enable set to a 1."7389*/7390iris_emit_raw_pipe_control(batch,7391"workaround: recursive VF cache invalidate",73920, NULL, 0, 0);7393}73947395/* Wa_1409226450, Wait for EU to be idle before pipe control which7396* invalidates the instruction cache7397*/7398if (GFX_VER == 12 && (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE)) {7399iris_emit_raw_pipe_control(batch,7400"workaround: CS stall before instruction "7401"cache invalidate",7402PIPE_CONTROL_CS_STALL |7403PIPE_CONTROL_STALL_AT_SCOREBOARD, bo, offset,7404imm);7405}74067407if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0*/)) &&7408IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {7409/* Project: SKL / Argument: LRI Post Sync Operation [23]7410*7411* "PIPECONTROL command with “Command Streamer Stall Enable” must be7412* programmed prior to programming a PIPECONTROL command with "LRI7413* Post Sync Operation" in GPGPU mode of operation (i.e when7414* PIPELINE_SELECT command is set to GPGPU mode of operation)."7415*7416* The same text exists a few rows below for Post Sync Op.7417*7418* On Gfx12 this is Wa_1607156449.7419*/7420iris_emit_raw_pipe_control(batch,7421"workaround: CS stall before gpgpu post-sync",7422PIPE_CONTROL_CS_STALL, bo, offset, imm);7423}74247425/* "Flush Types" workarounds ---------------------------------------------7426* We do these now because they may add post-sync operations or CS stalls.7427*/74287429if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {7430/* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate7431*7432* "'Post Sync Operation' must be enabled to 'Write Immediate Data' or7433* 'Write PS Depth Count' or 'Write Timestamp'."7434*/7435if (!bo) {7436flags |= PIPE_CONTROL_WRITE_IMMEDIATE;7437post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;7438non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;7439bo = batch->screen->workaround_address.bo;7440offset = batch->screen->workaround_address.offset;7441}7442}74437444if (flags & PIPE_CONTROL_DEPTH_STALL) {7445/* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):7446*7447* "This bit must be DISABLED for operations other than writing7448* PS_DEPTH_COUNT."7449*7450* This seems like nonsense. An Ivybridge workaround requires us to7451* emit a PIPE_CONTROL with a depth stall and write immediate post-sync7452* operation. Gfx8+ requires us to emit depth stalls and depth cache7453* flushes together. So, it's hard to imagine this means anything other7454* than "we originally intended this to be used for PS_DEPTH_COUNT".7455*7456* We ignore the supposed restriction and do nothing.7457*/7458}74597460if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |7461PIPE_CONTROL_STALL_AT_SCOREBOARD)) {7462/* From the PIPE_CONTROL instruction table, bit 12 and bit 1:7463*7464* "This bit must be DISABLED for End-of-pipe (Read) fences,7465* PS_DEPTH_COUNT or TIMESTAMP queries."7466*7467* TODO: Implement end-of-pipe checking.7468*/7469assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |7470PIPE_CONTROL_WRITE_TIMESTAMP)));7471}74727473if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {7474/* From the PIPE_CONTROL instruction table, bit 1:7475*7476* "This bit is ignored if Depth Stall Enable is set.7477* Further, the render cache is not flushed even if Write Cache7478* Flush Enable bit is set."7479*7480* We assert that the caller doesn't do this combination, to try and7481* prevent mistakes. It shouldn't hurt the GPU, though.7482*7483* We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"7484* and "Render Target Flush" combo is explicitly required for BTI7485* update workarounds.7486*/7487assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |7488PIPE_CONTROL_RENDER_TARGET_FLUSH)));7489}74907491/* PIPE_CONTROL page workarounds ------------------------------------- */74927493if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {7494/* From the PIPE_CONTROL page itself:7495*7496* "IVB, HSW, BDW7497* Restriction: Pipe_control with CS-stall bit set must be issued7498* before a pipe-control command that has the State Cache7499* Invalidate bit set."7500*/7501flags |= PIPE_CONTROL_CS_STALL;7502}75037504if (flags & PIPE_CONTROL_FLUSH_LLC) {7505/* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):7506*7507* "Project: ALL7508* SW must always program Post-Sync Operation to "Write Immediate7509* Data" when Flush LLC is set."7510*7511* For now, we just require the caller to do it.7512*/7513assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);7514}75157516/* "Post-Sync Operation" workarounds -------------------------------- */75177518/* Project: All / Argument: Global Snapshot Count Reset [19]7519*7520* "This bit must not be exercised on any product.7521* Requires stall bit ([20] of DW1) set."7522*7523* We don't use this, so we just assert that it isn't used. The7524* PIPE_CONTROL instruction page indicates that they intended this7525* as a debug feature and don't think it is useful in production,7526* but it may actually be usable, should we ever want to.7527*/7528assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);75297530if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |7531PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {7532/* Project: All / Arguments:7533*7534* - Generic Media State Clear [16]7535* - Indirect State Pointers Disable [16]7536*7537* "Requires stall bit ([20] of DW1) set."7538*7539* Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media7540* State Clear) says:7541*7542* "PIPECONTROL command with “Command Streamer Stall Enable” must be7543* programmed prior to programming a PIPECONTROL command with "Media7544* State Clear" set in GPGPU mode of operation"7545*7546* This is a subset of the earlier rule, so there's nothing to do.7547*/7548flags |= PIPE_CONTROL_CS_STALL;7549}75507551if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {7552/* Project: All / Argument: Store Data Index7553*7554* "Post-Sync Operation ([15:14] of DW1) must be set to something other7555* than '0'."7556*7557* For now, we just assert that the caller does this. We might want to7558* automatically add a write to the workaround BO...7559*/7560assert(non_lri_post_sync_flags != 0);7561}75627563if (flags & PIPE_CONTROL_SYNC_GFDT) {7564/* Project: All / Argument: Sync GFDT7565*7566* "Post-Sync Operation ([15:14] of DW1) must be set to something other7567* than '0' or 0x2520[13] must be set."7568*7569* For now, we just assert that the caller does this.7570*/7571assert(non_lri_post_sync_flags != 0);7572}75737574if (flags & PIPE_CONTROL_TLB_INVALIDATE) {7575/* Project: IVB+ / Argument: TLB inv7576*7577* "Requires stall bit ([20] of DW1) set."7578*7579* Also, from the PIPE_CONTROL instruction table:7580*7581* "Project: SKL+7582* Post Sync Operation or CS stall must be set to ensure a TLB7583* invalidation occurs. Otherwise no cycle will occur to the TLB7584* cache to invalidate."7585*7586* This is not a subset of the earlier rule, so there's nothing to do.7587*/7588flags |= PIPE_CONTROL_CS_STALL;7589}75907591if (GFX_VER == 9 && devinfo->gt == 4) {7592/* TODO: The big Skylake GT4 post sync op workaround */7593}75947595/* "GPGPU specific workarounds" (both post-sync and flush) ------------ */75967597if (IS_COMPUTE_PIPELINE(batch)) {7598if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {7599/* Project: SKL+ / Argument: Tex Invalidate7600* "Requires stall bit ([20] of DW) set for all GPGPU Workloads."7601*/7602flags |= PIPE_CONTROL_CS_STALL;7603}76047605if (GFX_VER == 8 && (post_sync_flags ||7606(flags & (PIPE_CONTROL_NOTIFY_ENABLE |7607PIPE_CONTROL_DEPTH_STALL |7608PIPE_CONTROL_RENDER_TARGET_FLUSH |7609PIPE_CONTROL_DEPTH_CACHE_FLUSH |7610PIPE_CONTROL_DATA_CACHE_FLUSH)))) {7611/* Project: BDW / Arguments:7612*7613* - LRI Post Sync Operation [23]7614* - Post Sync Op [15:14]7615* - Notify En [8]7616* - Depth Stall [13]7617* - Render Target Cache Flush [12]7618* - Depth Cache Flush [0]7619* - DC Flush Enable [5]7620*7621* "Requires stall bit ([20] of DW) set for all GPGPU and Media7622* Workloads."7623*/7624flags |= PIPE_CONTROL_CS_STALL;76257626/* Also, from the PIPE_CONTROL instruction table, bit 20:7627*7628* "Project: BDW7629* This bit must be always set when PIPE_CONTROL command is7630* programmed by GPGPU and MEDIA workloads, except for the cases7631* when only Read Only Cache Invalidation bits are set (State7632* Cache Invalidation Enable, Instruction cache Invalidation7633* Enable, Texture Cache Invalidation Enable, Constant Cache7634* Invalidation Enable). This is to WA FFDOP CG issue, this WA7635* need not implemented when FF_DOP_CG is disable via "Fixed7636* Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."7637*7638* It sounds like we could avoid CS stalls in some cases, but we7639* don't currently bother. This list isn't exactly the list above,7640* either...7641*/7642}7643}76447645/* "Stall" workarounds ----------------------------------------------7646* These have to come after the earlier ones because we may have added7647* some additional CS stalls above.7648*/76497650if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {7651/* Project: PRE-SKL, VLV, CHV7652*7653* "[All Stepping][All SKUs]:7654*7655* One of the following must also be set:7656*7657* - Render Target Cache Flush Enable ([12] of DW1)7658* - Depth Cache Flush Enable ([0] of DW1)7659* - Stall at Pixel Scoreboard ([1] of DW1)7660* - Depth Stall ([13] of DW1)7661* - Post-Sync Operation ([13] of DW1)7662* - DC Flush Enable ([5] of DW1)"7663*7664* If we don't already have one of those bits set, we choose to add7665* "Stall at Pixel Scoreboard". Some of the other bits require a7666* CS stall as a workaround (see above), which would send us into7667* an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"7668* appears to be safe, so we choose that.7669*/7670const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |7671PIPE_CONTROL_DEPTH_CACHE_FLUSH |7672PIPE_CONTROL_WRITE_IMMEDIATE |7673PIPE_CONTROL_WRITE_DEPTH_COUNT |7674PIPE_CONTROL_WRITE_TIMESTAMP |7675PIPE_CONTROL_STALL_AT_SCOREBOARD |7676PIPE_CONTROL_DEPTH_STALL |7677PIPE_CONTROL_DATA_CACHE_FLUSH;7678if (!(flags & wa_bits))7679flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;7680}76817682if (GFX_VER >= 12 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {7683/* Wa_1409600907:7684*7685* "PIPE_CONTROL with Depth Stall Enable bit must be set7686* with any PIPE_CONTROL with Depth Flush Enable bit set.7687*/7688flags |= PIPE_CONTROL_DEPTH_STALL;7689}76907691/* Emit --------------------------------------------------------------- */76927693if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) {7694fprintf(stderr,7695" PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",7696(flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",7697(flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",7698(flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",7699(flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",7700(flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",7701(flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",7702(flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",7703(flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",7704(flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",7705(flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",7706(flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",7707(flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",7708(flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",7709(flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",7710(flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",7711(flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",7712(flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?7713"SnapRes" : "",7714(flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?7715"ISPDis" : "",7716(flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",7717(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",7718(flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",7719(flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",7720imm, reason);7721}77227723batch_mark_sync_for_pipe_control(batch, flags);7724iris_batch_sync_region_start(batch);77257726iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {7727#if GFX_VER >= 127728pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;7729#endif7730#if GFX_VER >= 117731pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;7732#endif7733pc.LRIPostSyncOperation = NoLRIOperation;7734pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;7735pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;7736pc.StoreDataIndex = 0;7737pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;7738pc.GlobalSnapshotCountReset =7739flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;7740pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;7741pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;7742pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;7743pc.RenderTargetCacheFlushEnable =7744flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;7745pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;7746pc.StateCacheInvalidationEnable =7747flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;7748pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;7749pc.ConstantCacheInvalidationEnable =7750flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;7751pc.PostSyncOperation = flags_to_post_sync_op(flags);7752pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;7753pc.InstructionCacheInvalidateEnable =7754flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;7755pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;7756pc.IndirectStatePointersDisable =7757flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;7758pc.TextureCacheInvalidationEnable =7759flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;7760pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);7761pc.ImmediateData = imm;7762}77637764iris_batch_sync_region_end(batch);7765}77667767#if GFX_VER == 97768/**7769* Preemption on Gfx9 has to be enabled or disabled in various cases.7770*7771* See these workarounds for preemption:7772* - WaDisableMidObjectPreemptionForGSLineStripAdj7773* - WaDisableMidObjectPreemptionForTrifanOrPolygon7774* - WaDisableMidObjectPreemptionForLineLoop7775* - WA#07987776*7777* We don't put this in the vtable because it's only used on Gfx9.7778*/7779void7780gfx9_toggle_preemption(struct iris_context *ice,7781struct iris_batch *batch,7782const struct pipe_draw_info *draw)7783{7784struct iris_genx_state *genx = ice->state.genx;7785bool object_preemption = true;77867787/* WaDisableMidObjectPreemptionForGSLineStripAdj7788*7789* "WA: Disable mid-draw preemption when draw-call is a linestrip_adj7790* and GS is enabled."7791*/7792if (draw->mode == PIPE_PRIM_LINE_STRIP_ADJACENCY &&7793ice->shaders.prog[MESA_SHADER_GEOMETRY])7794object_preemption = false;77957796/* WaDisableMidObjectPreemptionForTrifanOrPolygon7797*7798* "TriFan miscompare in Execlist Preemption test. Cut index that is7799* on a previous context. End the previous, the resume another context7800* with a tri-fan or polygon, and the vertex count is corrupted. If we7801* prempt again we will cause corruption.7802*7803* WA: Disable mid-draw preemption when draw-call has a tri-fan."7804*/7805if (draw->mode == PIPE_PRIM_TRIANGLE_FAN)7806object_preemption = false;78077808/* WaDisableMidObjectPreemptionForLineLoop7809*7810* "VF Stats Counters Missing a vertex when preemption enabled.7811*7812* WA: Disable mid-draw preemption when the draw uses a lineloop7813* topology."7814*/7815if (draw->mode == PIPE_PRIM_LINE_LOOP)7816object_preemption = false;78177818/* WA#07987819*7820* "VF is corrupting GAFS data when preempted on an instance boundary7821* and replayed with instancing enabled.7822*7823* WA: Disable preemption when using instanceing."7824*/7825if (draw->instance_count > 1)7826object_preemption = false;78277828if (genx->object_preemption != object_preemption) {7829iris_enable_obj_preemption(batch, object_preemption);7830genx->object_preemption = object_preemption;7831}7832}7833#endif78347835static void7836iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)7837{7838struct iris_genx_state *genx = ice->state.genx;78397840memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));7841}78427843static void7844iris_emit_mi_report_perf_count(struct iris_batch *batch,7845struct iris_bo *bo,7846uint32_t offset_in_bytes,7847uint32_t report_id)7848{7849iris_batch_sync_region_start(batch);7850iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {7851mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,7852IRIS_DOMAIN_OTHER_WRITE);7853mi_rpc.ReportID = report_id;7854}7855iris_batch_sync_region_end(batch);7856}78577858/**7859* Update the pixel hashing modes that determine the balancing of PS threads7860* across subslices and slices.7861*7862* \param width Width bound of the rendering area (already scaled down if \p7863* scale is greater than 1).7864* \param height Height bound of the rendering area (already scaled down if \p7865* scale is greater than 1).7866* \param scale The number of framebuffer samples that could potentially be7867* affected by an individual channel of the PS thread. This is7868* typically one for single-sampled rendering, but for operations7869* like CCS resolves and fast clears a single PS invocation may7870* update a huge number of pixels, in which case a finer7871* balancing is desirable in order to maximally utilize the7872* bandwidth available. UINT_MAX can be used as shorthand for7873* "finest hashing mode available".7874*/7875void7876genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,7877unsigned width, unsigned height, unsigned scale)7878{7879#if GFX_VER == 97880const struct intel_device_info *devinfo = &batch->screen->devinfo;7881const unsigned slice_hashing[] = {7882/* Because all Gfx9 platforms with more than one slice require7883* three-way subslice hashing, a single "normal" 16x16 slice hashing7884* block is guaranteed to suffer from substantial imbalance, with one7885* subslice receiving twice as much work as the other two in the7886* slice.7887*7888* The performance impact of that would be particularly severe when7889* three-way hashing is also in use for slice balancing (which is the7890* case for all Gfx9 GT4 platforms), because one of the slices7891* receives one every three 16x16 blocks in either direction, which7892* is roughly the periodicity of the underlying subslice imbalance7893* pattern ("roughly" because in reality the hardware's7894* implementation of three-way hashing doesn't do exact modulo 37895* arithmetic, which somewhat decreases the magnitude of this effect7896* in practice). This leads to a systematic subslice imbalance7897* within that slice regardless of the size of the primitive. The7898* 32x32 hashing mode guarantees that the subslice imbalance within a7899* single slice hashing block is minimal, largely eliminating this7900* effect.7901*/7902_32x32,7903/* Finest slice hashing mode available. */7904NORMAL7905};7906const unsigned subslice_hashing[] = {7907/* 16x16 would provide a slight cache locality benefit especially7908* visible in the sampler L1 cache efficiency of low-bandwidth7909* non-LLC platforms, but it comes at the cost of greater subslice7910* imbalance for primitives of dimensions approximately intermediate7911* between 16x4 and 16x16.7912*/7913_16x4,7914/* Finest subslice hashing mode available. */7915_8x47916};7917/* Dimensions of the smallest hashing block of a given hashing mode. If7918* the rendering area is smaller than this there can't possibly be any7919* benefit from switching to this mode, so we optimize out the7920* transition.7921*/7922const unsigned min_size[][2] = {7923{ 16, 4 },7924{ 8, 4 }7925};7926const unsigned idx = scale > 1;79277928if (width > min_size[idx][0] || height > min_size[idx][1]) {7929iris_emit_raw_pipe_control(batch,7930"workaround: CS stall before GT_MODE LRI",7931PIPE_CONTROL_STALL_AT_SCOREBOARD |7932PIPE_CONTROL_CS_STALL,7933NULL, 0, 0);79347935iris_emit_reg(batch, GENX(GT_MODE), reg) {7936reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);7937reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);7938reg.SubsliceHashing = subslice_hashing[idx];7939reg.SubsliceHashingMask = -1;7940};79417942ice->state.current_hash_scale = scale;7943}7944#endif7945}79467947static void7948iris_set_frontend_noop(struct pipe_context *ctx, bool enable)7949{7950struct iris_context *ice = (struct iris_context *) ctx;79517952if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {7953ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;7954ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;7955}79567957if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {7958ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;7959ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;7960}7961}79627963void7964genX(init_screen_state)(struct iris_screen *screen)7965{7966assert(screen->devinfo.verx10 == GFX_VERx10);7967screen->vtbl.destroy_state = iris_destroy_state;7968screen->vtbl.init_render_context = iris_init_render_context;7969screen->vtbl.init_compute_context = iris_init_compute_context;7970screen->vtbl.upload_render_state = iris_upload_render_state;7971screen->vtbl.update_surface_base_address = iris_update_surface_base_address;7972screen->vtbl.upload_compute_state = iris_upload_compute_state;7973screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;7974screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;7975screen->vtbl.rebind_buffer = iris_rebind_buffer;7976screen->vtbl.load_register_reg32 = iris_load_register_reg32;7977screen->vtbl.load_register_reg64 = iris_load_register_reg64;7978screen->vtbl.load_register_imm32 = iris_load_register_imm32;7979screen->vtbl.load_register_imm64 = iris_load_register_imm64;7980screen->vtbl.load_register_mem32 = iris_load_register_mem32;7981screen->vtbl.load_register_mem64 = iris_load_register_mem64;7982screen->vtbl.store_register_mem32 = iris_store_register_mem32;7983screen->vtbl.store_register_mem64 = iris_store_register_mem64;7984screen->vtbl.store_data_imm32 = iris_store_data_imm32;7985screen->vtbl.store_data_imm64 = iris_store_data_imm64;7986screen->vtbl.copy_mem_mem = iris_copy_mem_mem;7987screen->vtbl.derived_program_state_size = iris_derived_program_state_size;7988screen->vtbl.store_derived_program_state = iris_store_derived_program_state;7989screen->vtbl.create_so_decl_list = iris_create_so_decl_list;7990screen->vtbl.populate_vs_key = iris_populate_vs_key;7991screen->vtbl.populate_tcs_key = iris_populate_tcs_key;7992screen->vtbl.populate_tes_key = iris_populate_tes_key;7993screen->vtbl.populate_gs_key = iris_populate_gs_key;7994screen->vtbl.populate_fs_key = iris_populate_fs_key;7995screen->vtbl.populate_cs_key = iris_populate_cs_key;7996screen->vtbl.lost_genx_state = iris_lost_genx_state;7997}79987999void8000genX(init_state)(struct iris_context *ice)8001{8002struct pipe_context *ctx = &ice->ctx;8003struct iris_screen *screen = (struct iris_screen *)ctx->screen;80048005ctx->create_blend_state = iris_create_blend_state;8006ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;8007ctx->create_rasterizer_state = iris_create_rasterizer_state;8008ctx->create_sampler_state = iris_create_sampler_state;8009ctx->create_sampler_view = iris_create_sampler_view;8010ctx->create_surface = iris_create_surface;8011ctx->create_vertex_elements_state = iris_create_vertex_elements;8012ctx->bind_blend_state = iris_bind_blend_state;8013ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;8014ctx->bind_sampler_states = iris_bind_sampler_states;8015ctx->bind_rasterizer_state = iris_bind_rasterizer_state;8016ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;8017ctx->delete_blend_state = iris_delete_state;8018ctx->delete_depth_stencil_alpha_state = iris_delete_state;8019ctx->delete_rasterizer_state = iris_delete_state;8020ctx->delete_sampler_state = iris_delete_state;8021ctx->delete_vertex_elements_state = iris_delete_state;8022ctx->set_blend_color = iris_set_blend_color;8023ctx->set_clip_state = iris_set_clip_state;8024ctx->set_constant_buffer = iris_set_constant_buffer;8025ctx->set_shader_buffers = iris_set_shader_buffers;8026ctx->set_shader_images = iris_set_shader_images;8027ctx->set_sampler_views = iris_set_sampler_views;8028ctx->set_compute_resources = iris_set_compute_resources;8029ctx->set_global_binding = iris_set_global_binding;8030ctx->set_tess_state = iris_set_tess_state;8031ctx->set_framebuffer_state = iris_set_framebuffer_state;8032ctx->set_polygon_stipple = iris_set_polygon_stipple;8033ctx->set_sample_mask = iris_set_sample_mask;8034ctx->set_scissor_states = iris_set_scissor_states;8035ctx->set_stencil_ref = iris_set_stencil_ref;8036ctx->set_vertex_buffers = iris_set_vertex_buffers;8037ctx->set_viewport_states = iris_set_viewport_states;8038ctx->sampler_view_destroy = iris_sampler_view_destroy;8039ctx->surface_destroy = iris_surface_destroy;8040ctx->draw_vbo = iris_draw_vbo;8041ctx->launch_grid = iris_launch_grid;8042ctx->create_stream_output_target = iris_create_stream_output_target;8043ctx->stream_output_target_destroy = iris_stream_output_target_destroy;8044ctx->set_stream_output_targets = iris_set_stream_output_targets;8045ctx->set_frontend_noop = iris_set_frontend_noop;80468047ice->state.dirty = ~0ull;8048ice->state.stage_dirty = ~0ull;80498050ice->state.statistics_counters_enabled = true;80518052ice->state.sample_mask = 0xffff;8053ice->state.num_viewports = 1;8054ice->state.prim_mode = PIPE_PRIM_MAX;8055ice->state.genx = calloc(1, sizeof(struct iris_genx_state));8056ice->draw.derived_params.drawid = -1;80578058/* Make a 1x1x1 null surface for unbound textures */8059void *null_surf_map =8060upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,80614 * GENX(RENDER_SURFACE_STATE_length), 64);8062isl_null_fill_state(&screen->isl_dev, null_surf_map,8063.size = isl_extent3d(1, 1, 1));8064ice->state.unbound_tex.offset +=8065iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));80668067/* Default all scissor rectangles to be empty regions. */8068for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {8069ice->state.scissors[i] = (struct pipe_scissor_state) {8070.minx = 1, .maxx = 0, .miny = 1, .maxy = 0,8071};8072}8073}807480758076