Path: blob/21.2-virgl/src/intel/blorp/blorp_blit.c
7178 views
/*1* Copyright © 2012 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/2223#include "blorp_nir_builder.h"24#include "compiler/nir/nir_format_convert.h"2526#include "blorp_priv.h"2728#include "util/format_rgb9e5.h"29/* header-only include needed for _mesa_unorm_to_float and friends. 
*/30#include "mesa/main/format_utils.h"31#include "util/u_math.h"3233#define FILE_DEBUG_FLAG DEBUG_BLORP3435static const bool split_blorp_blit_debug = false;3637struct brw_blorp_blit_vars {38/* Input values from brw_blorp_wm_inputs */39nir_variable *v_discard_rect;40nir_variable *v_rect_grid;41nir_variable *v_coord_transform;42nir_variable *v_src_z;43nir_variable *v_src_offset;44nir_variable *v_dst_offset;45nir_variable *v_src_inv_size;46};4748static void49brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,50const struct brw_blorp_blit_prog_key *key)51{52#define LOAD_INPUT(name, type)\53v->v_##name = BLORP_CREATE_NIR_INPUT(b->shader, name, type);5455LOAD_INPUT(discard_rect, glsl_vec4_type())56LOAD_INPUT(rect_grid, glsl_vec4_type())57LOAD_INPUT(coord_transform, glsl_vec4_type())58LOAD_INPUT(src_z, glsl_float_type())59LOAD_INPUT(src_offset, glsl_vector_type(GLSL_TYPE_UINT, 2))60LOAD_INPUT(dst_offset, glsl_vector_type(GLSL_TYPE_UINT, 2))61LOAD_INPUT(src_inv_size, glsl_vector_type(GLSL_TYPE_FLOAT, 2))6263#undef LOAD_INPUT64}6566static nir_ssa_def *67blorp_blit_get_frag_coords(nir_builder *b,68const struct brw_blorp_blit_prog_key *key,69struct brw_blorp_blit_vars *v)70{71nir_ssa_def *coord = nir_f2i32(b, nir_load_frag_coord(b));7273/* Account for destination surface intratile offset74*75* Transformation parameters giving translation from destination to source76* coordinates don't take into account possible intra-tile destination77* offset. Therefore it has to be first subtracted from the incoming78* coordinates. 
Vertices are set up based on coordinates containing the79* intra-tile offset.80*/81if (key->need_dst_offset)82coord = nir_isub(b, coord, nir_load_var(b, v->v_dst_offset));8384if (key->persample_msaa_dispatch) {85return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),86nir_load_sample_id(b));87} else {88return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1));89}90}9192/**93* Emit code to translate from destination (X, Y) coordinates to source (X, Y)94* coordinates.95*/96static nir_ssa_def *97blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,98struct brw_blorp_blit_vars *v)99{100nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform);101102nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1),103nir_channel(b, coord_transform, 3));104nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0),105nir_channel(b, coord_transform, 2));106107return nir_fadd(b, nir_fmul(b, src_pos, mul), offset);108}109110static inline void111blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos,112struct brw_blorp_blit_vars *v)113{114nir_ssa_def *c0, *c1, *c2, *c3;115nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect);116nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0);117nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1);118nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2);119nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3);120121c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0);122c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1);123c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0);124c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1);125126nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3));127nir_discard_if(b, oob);128}129130static nir_tex_instr *131blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v,132nir_texop op, nir_ssa_def *pos, unsigned num_srcs,133nir_alu_type dst_type)134{135nir_tex_instr *tex = nir_tex_instr_create(b->shader, 
num_srcs);136137tex->op = op;138139tex->dest_type = dst_type | 32;140tex->is_array = false;141tex->is_shadow = false;142143/* Blorp only has one texture and it's bound at unit 0 */144tex->texture_index = 0;145tex->sampler_index = 0;146147/* To properly handle 3-D and 2-D array textures, we pull the Z component148* from an input. TODO: This is a bit magic; we should probably make this149* more explicit in the future.150*/151assert(pos->num_components >= 2);152if (op == nir_texop_txf || op == nir_texop_txf_ms || op == nir_texop_txf_ms_mcs) {153pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),154nir_f2i32(b, nir_load_var(b, v->v_src_z)));155} else {156pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),157nir_load_var(b, v->v_src_z));158}159160tex->src[0].src_type = nir_tex_src_coord;161tex->src[0].src = nir_src_for_ssa(pos);162tex->coord_components = 3;163164nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);165166return tex;167}168169static nir_ssa_def *170blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v,171const struct brw_blorp_blit_prog_key *key, nir_ssa_def *pos)172{173if (key->need_src_offset)174pos = nir_fadd(b, pos, nir_i2f32(b, nir_load_var(b, v->v_src_offset)));175176/* If the sampler requires normalized coordinates, we need to compensate. 
*/177if (key->src_coords_normalized)178pos = nir_fmul(b, pos, nir_load_var(b, v->v_src_inv_size));179180nir_tex_instr *tex =181blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2,182key->texture_data_type);183184assert(pos->num_components == 2);185tex->sampler_dim = GLSL_SAMPLER_DIM_2D;186tex->src[1].src_type = nir_tex_src_lod;187tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));188189nir_builder_instr_insert(b, &tex->instr);190191return &tex->dest.ssa;192}193194static nir_ssa_def *195blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v,196nir_ssa_def *pos, nir_alu_type dst_type)197{198nir_tex_instr *tex =199blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type);200201tex->sampler_dim = GLSL_SAMPLER_DIM_3D;202tex->src[1].src_type = nir_tex_src_lod;203tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));204205nir_builder_instr_insert(b, &tex->instr);206207return &tex->dest.ssa;208}209210static nir_ssa_def *211blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v,212nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type)213{214nir_tex_instr *tex =215blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos,216mcs != NULL ? 
3 : 2, dst_type);217218tex->sampler_dim = GLSL_SAMPLER_DIM_MS;219220tex->src[1].src_type = nir_tex_src_ms_index;221if (pos->num_components == 2) {222tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));223} else {224assert(pos->num_components == 3);225tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2));226}227228if (mcs) {229tex->src[2].src_type = nir_tex_src_ms_mcs;230tex->src[2].src = nir_src_for_ssa(mcs);231}232233nir_builder_instr_insert(b, &tex->instr);234235return &tex->dest.ssa;236}237238static nir_ssa_def *239blorp_blit_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v,240nir_ssa_def *pos)241{242nir_tex_instr *tex =243blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs,244pos, 1, nir_type_int);245246tex->sampler_dim = GLSL_SAMPLER_DIM_MS;247248nir_builder_instr_insert(b, &tex->instr);249250return &tex->dest.ssa;251}252253/**254* Emit code to compensate for the difference between Y and W tiling.255*256* This code modifies the X and Y coordinates according to the formula:257*258* (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))259*260* (See brw_blorp_build_nir_shader).261*/262static inline nir_ssa_def *263blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos)264{265assert(pos->num_components == 2);266nir_ssa_def *x_Y = nir_channel(b, pos, 0);267nir_ssa_def *y_Y = nir_channel(b, pos, 1);268269/* Given X and Y coordinates that describe an address using Y tiling,270* translate to the X and Y coordinates that describe the same address271* using W tiling.272*273* If we break down the low order bits of X and Y, using a274* single letter to represent each low-order bit:275*276* X = A << 7 | 0bBCDEFGH277* Y = J << 5 | 0bKLMNP (1)278*279* Then we can apply the Y tiling formula to see the memory offset being280* addressed:281*282* offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)283*284* If we apply the W detiling formula to this memory location, that the285* corresponding X' and Y' coordinates are:286*287* X' = A << 6 | 0bBCDPFH (3)288* 
Y' = J << 6 | 0bKLMNEG289*290* Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),291* we need to make the following computation:292*293* X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)294* Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1295*/296nir_ssa_def *x_W = nir_imm_int(b, 0);297x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1);298x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2);299x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0);300301nir_ssa_def *y_W = nir_imm_int(b, 0);302y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1);303y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2);304y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1);305306return nir_vec2(b, x_W, y_W);307}308309/**310* Emit code to compensate for the difference between Y and W tiling.311*312* This code modifies the X and Y coordinates according to the formula:313*314* (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))315*316* (See brw_blorp_build_nir_shader).317*/318static inline nir_ssa_def *319blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)320{321assert(pos->num_components == 2);322nir_ssa_def *x_W = nir_channel(b, pos, 0);323nir_ssa_def *y_W = nir_channel(b, pos, 1);324325/* Applying the same logic as above, but in reverse, we obtain the326* formulas:327*328* X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1329* Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2330*/331nir_ssa_def *x_Y = nir_imm_int(b, 0);332x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1);333x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2);334x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1);335x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0);336337nir_ssa_def *y_Y = nir_imm_int(b, 0);338y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1);339y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2);340341return nir_vec2(b, x_Y, y_Y);342}343344/**345* Emit code to compensate for the difference between MSAA and non-MSAA346* surfaces.347*348* This code modifies the X and Y coordinates according to the 
formula:349*350* (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)351*352* (See brw_blorp_blit_program).353*/354static inline nir_ssa_def *355blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,356unsigned num_samples, enum isl_msaa_layout layout)357{358assert(pos->num_components == 2 || pos->num_components == 3);359360switch (layout) {361case ISL_MSAA_LAYOUT_NONE:362assert(pos->num_components == 2);363return pos;364case ISL_MSAA_LAYOUT_ARRAY:365/* No translation needed */366return pos;367case ISL_MSAA_LAYOUT_INTERLEAVED: {368nir_ssa_def *x_in = nir_channel(b, pos, 0);369nir_ssa_def *y_in = nir_channel(b, pos, 1);370nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :371nir_channel(b, pos, 2);372373nir_ssa_def *x_out = nir_imm_int(b, 0);374nir_ssa_def *y_out = nir_imm_int(b, 0);375switch (num_samples) {376case 2:377case 4:378/* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)379* where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)380* Y' = Y381*382* encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)383* where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)384* Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)385*/386x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);387x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);388x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);389if (num_samples == 2) {390y_out = y_in;391} else {392y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);393y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);394y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);395}396break;397398case 8:399/* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)400* where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1401* | (X & 0b1)402* Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)403*/404x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);405x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);406x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);407x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);408y_out = nir_mask_shift_or(b, y_out, y_in, 
0xfffffffe, 1);409y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);410y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);411break;412413case 16:414/* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)415* where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1416* | (X & 0b1)417* Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)418* | (Y & 0b1)419*/420x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);421x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);422x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);423x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);424y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2);425y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1);426y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);427y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);428break;429430default:431unreachable("Invalid number of samples for IMS layout");432}433434return nir_vec2(b, x_out, y_out);435}436437default:438unreachable("Invalid MSAA layout");439}440}441442/**443* Emit code to compensate for the difference between MSAA and non-MSAA444* surfaces.445*446* This code modifies the X and Y coordinates according to the formula:447*448* (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)449*450* (See brw_blorp_blit_program).451*/452static inline nir_ssa_def *453blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,454unsigned num_samples, enum isl_msaa_layout layout)455{456assert(pos->num_components == 2 || pos->num_components == 3);457458switch (layout) {459case ISL_MSAA_LAYOUT_NONE:460/* No translation necessary, and S should already be zero. */461assert(pos->num_components == 2);462return pos;463case ISL_MSAA_LAYOUT_ARRAY:464/* No translation necessary. 
*/465return pos;466case ISL_MSAA_LAYOUT_INTERLEAVED: {467assert(pos->num_components == 2);468469nir_ssa_def *x_in = nir_channel(b, pos, 0);470nir_ssa_def *y_in = nir_channel(b, pos, 1);471472nir_ssa_def *x_out = nir_imm_int(b, 0);473nir_ssa_def *y_out = nir_imm_int(b, 0);474nir_ssa_def *s_out = nir_imm_int(b, 0);475switch (num_samples) {476case 2:477case 4:478/* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)479* where X' = (X & ~0b11) >> 1 | (X & 0b1)480* S = (X & 0b10) >> 1481*482* decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)483* where X' = (X & ~0b11) >> 1 | (X & 0b1)484* Y' = (Y & ~0b11) >> 1 | (Y & 0b1)485* S = (Y & 0b10) | (X & 0b10) >> 1486*/487x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);488x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);489if (num_samples == 2) {490y_out = y_in;491s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);492} else {493y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);494y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);495s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);496s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);497}498break;499500case 8:501/* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)502* where X' = (X & ~0b111) >> 2 | (X & 0b1)503* Y' = (Y & ~0b11) >> 1 | (Y & 0b1)504* S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1505*/506x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);507x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);508y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);509y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);510s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);511s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);512s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);513break;514515case 16:516/* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)517* where X' = (X & ~0b111) >> 2 | (X & 0b1)518* Y' = (Y & ~0b111) >> 2 | (Y & 0b1)519* S = (Y & 0b100) << 1 | (X & 0b100) |520* (Y & 0b10) | (X & 0b10) >> 1521*/522x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);523x_out = 
nir_mask_shift_or(b, x_out, x_in, 0x1, 0);524y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2);525y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);526s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1);527s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);528s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);529s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);530break;531532default:533unreachable("Invalid number of samples for IMS layout");534}535536return nir_vec3(b, x_out, y_out, s_out);537}538539default:540unreachable("Invalid MSAA layout");541}542}543544/**545* Count the number of trailing 1 bits in the given value. For example:546*547* count_trailing_one_bits(0) == 0548* count_trailing_one_bits(7) == 3549* count_trailing_one_bits(11) == 2550*/551static inline int count_trailing_one_bits(unsigned value)552{553#ifdef HAVE___BUILTIN_CTZ554return __builtin_ctz(~value);555#else556return util_bitcount(value & ~(value + 1));557#endif558}559560static nir_ssa_def *561blorp_nir_combine_samples(nir_builder *b, struct brw_blorp_blit_vars *v,562nir_ssa_def *pos, unsigned tex_samples,563enum isl_aux_usage tex_aux_usage,564nir_alu_type dst_type,565enum blorp_filter filter)566{567nir_variable *color =568nir_local_variable_create(b->impl, glsl_vec4_type(), "color");569570nir_ssa_def *mcs = NULL;571if (isl_aux_usage_has_mcs(tex_aux_usage))572mcs = blorp_blit_txf_ms_mcs(b, v, pos);573574nir_op combine_op;575switch (filter) {576case BLORP_FILTER_AVERAGE:577assert(dst_type == nir_type_float);578combine_op = nir_op_fadd;579break;580581case BLORP_FILTER_MIN_SAMPLE:582switch (dst_type) {583case nir_type_int: combine_op = nir_op_imin; break;584case nir_type_uint: combine_op = nir_op_umin; break;585case nir_type_float: combine_op = nir_op_fmin; break;586default: unreachable("Invalid dst_type");587}588break;589590case BLORP_FILTER_MAX_SAMPLE:591switch (dst_type) {592case nir_type_int: combine_op = nir_op_imax; break;593case nir_type_uint: combine_op = nir_op_umax; break;594case 
nir_type_float: combine_op = nir_op_fmax; break;595default: unreachable("Invalid dst_type");596}597break;598599default:600unreachable("Invalid filter");601}602603/* If true, we inserted an if statement that we need to pop at at the end.604*/605bool inserted_if = false;606607/* We add together samples using a binary tree structure, e.g. for 4x MSAA:608*609* result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4610*611* This ensures that when all samples have the same value, no numerical612* precision is lost, since each addition operation always adds two equal613* values, and summing two equal floating point values does not lose614* precision.615*616* We perform this computation by treating the texture_data array as a617* stack and performing the following operations:618*619* - push sample 0 onto stack620* - push sample 1 onto stack621* - add top two stack entries622* - push sample 2 onto stack623* - push sample 3 onto stack624* - add top two stack entries625* - add top two stack entries626* - divide top stack entry by 4627*628* Note that after pushing sample i onto the stack, the number of add629* operations we do is equal to the number of trailing 1 bits in i. 
This630* works provided the total number of samples is a power of two, which it631* always is for i965.632*633* For integer formats, we replace the add operations with average634* operations and skip the final division.635*/636nir_ssa_def *texture_data[5];637texture_data[0] = NULL; /* Avoid maybe-uninitialized warning with GCC 10 */638unsigned stack_depth = 0;639for (unsigned i = 0; i < tex_samples; ++i) {640assert(stack_depth == util_bitcount(i)); /* Loop invariant */641642/* Push sample i onto the stack */643assert(stack_depth < ARRAY_SIZE(texture_data));644645nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),646nir_channel(b, pos, 1),647nir_imm_int(b, i));648texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type);649650if (i == 0 && isl_aux_usage_has_mcs(tex_aux_usage)) {651/* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)652* suggests an optimization:653*654* "A simple optimization with probable large return in655* performance is to compare the MCS value to zero (indicating656* all samples are on sample slice 0), and sample only from657* sample slice 0 using ld2dss if MCS is zero."658*659* Note that in the case where the MCS value is zero, sampling from660* sample slice 0 using ld2dss and sampling from sample 0 using661* ld2dms are equivalent (since all samples are on sample slice 0).662* Since we have already sampled from sample 0, all we need to do is663* skip the remaining fetches and averaging if MCS is zero.664*665* It's also trivial to detect when the MCS has the magic clear color666* value. 
In this case, the txf we did on sample 0 will return the667* clear color and we can skip the remaining fetches just like we do668* when MCS == 0.669*/670nir_ssa_def *mcs_zero = nir_ieq_imm(b, nir_channel(b, mcs, 0), 0);671if (tex_samples == 16) {672mcs_zero = nir_iand(b, mcs_zero,673nir_ieq_imm(b, nir_channel(b, mcs, 1), 0));674}675nir_ssa_def *mcs_clear =676blorp_nir_mcs_is_clear_color(b, mcs, tex_samples);677678nir_push_if(b, nir_ior(b, mcs_zero, mcs_clear));679nir_store_var(b, color, texture_data[0], 0xf);680681nir_push_else(b, NULL);682inserted_if = true;683}684685for (int j = 0; j < count_trailing_one_bits(i); j++) {686assert(stack_depth >= 2);687--stack_depth;688689texture_data[stack_depth - 1] =690nir_build_alu(b, combine_op,691texture_data[stack_depth - 1],692texture_data[stack_depth],693NULL, NULL);694}695}696697/* We should have just 1 sample on the stack now. */698assert(stack_depth == 1);699700if (filter == BLORP_FILTER_AVERAGE) {701assert(dst_type == nir_type_float);702texture_data[0] = nir_fmul(b, texture_data[0],703nir_imm_float(b, 1.0 / tex_samples));704}705706nir_store_var(b, color, texture_data[0], 0xf);707708if (inserted_if)709nir_pop_if(b, NULL);710711return nir_load_var(b, color);712}713714static nir_ssa_def *715blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,716unsigned tex_samples,717const struct brw_blorp_blit_prog_key *key,718struct brw_blorp_blit_vars *v)719{720nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);721nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid);722nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);723724/* Translate coordinates to lay out the samples in a rectangular grid725* roughly corresponding to sample locations.726*/727pos_xy = nir_fmul(b, pos_xy, scale);728/* Adjust coordinates so that integers represent pixel centers rather729* than pixel edges.730*/731pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));732/* Clamp the X, Y texture coordinates to properly handle the sampling 
of733* texels on texture edges.734*/735pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),736nir_vec2(b, nir_channel(b, rect_grid, 0),737nir_channel(b, rect_grid, 1)));738739/* Store the fractional parts to be used as bilinear interpolation740* coefficients.741*/742nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);743/* Round the float coordinates down to nearest integer */744pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);745746nir_ssa_def *tex_data[4];747for (unsigned i = 0; i < 4; ++i) {748float sample_off_x = (float)(i & 0x1) / key->x_scale;749float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;750nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);751752nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);753nir_ssa_def *sample_coords_int = nir_f2i32(b, sample_coords);754755/* The MCS value we fetch has to match up with the pixel that we're756* sampling from. Since we sample from different pixels in each757* iteration of this "for" loop, the call to mcs_fetch() should be758* here inside the loop after computing the pixel coordinates.759*/760nir_ssa_def *mcs = NULL;761if (isl_aux_usage_has_mcs(key->tex_aux_usage))762mcs = blorp_blit_txf_ms_mcs(b, v, sample_coords_int);763764/* Compute sample index and map the sample index to a sample number.765* Sample index layout shows the numbering of slots in a rectangular766* grid of samples with in a pixel. 
Sample number layout shows the767* rectangular grid of samples roughly corresponding to the real sample768* locations with in a pixel.769*770* In the case of 2x MSAA, the layout of sample indices is reversed from771* the layout of sample numbers:772*773* sample index layout : --------- sample number layout : ---------774* | 0 | 1 | | 1 | 0 |775* --------- ---------776*777* In case of 4x MSAA, layout of sample indices matches the layout of778* sample numbers:779* ---------780* | 0 | 1 |781* ---------782* | 2 | 3 |783* ---------784*785* In case of 8x MSAA the two layouts don't match.786* sample index layout : --------- sample number layout : ---------787* | 0 | 1 | | 3 | 7 |788* --------- ---------789* | 2 | 3 | | 5 | 0 |790* --------- ---------791* | 4 | 5 | | 1 | 2 |792* --------- ---------793* | 6 | 7 | | 4 | 6 |794* --------- ---------795*796* Fortunately, this can be done fairly easily as:797* S' = (0x17306425 >> (S * 4)) & 0xf798*799* In the case of 16x MSAA the two layouts don't match.800* Sample index layout: Sample number layout:801* --------------------- ---------------------802* | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 |803* --------------------- ---------------------804* | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 |805* --------------------- ---------------------806* | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |807* --------------------- ---------------------808* | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |809* --------------------- ---------------------810*811* This is equivalent to812* S' = (0xe58b602cd31479af >> (S * 4)) & 0xf813*/814nir_ssa_def *frac = nir_ffract(b, sample_coords);815nir_ssa_def *sample =816nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,817key->x_scale * key->y_scale));818sample = nir_f2i32(b, sample);819820if (tex_samples == 2) {821sample = nir_isub(b, nir_imm_int(b, 1), sample);822} else if (tex_samples == 8) {823sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),824nir_ishl(b, sample, nir_imm_int(b, 2))),825nir_imm_int(b, 0xf));826} else if 
(tex_samples == 16) {827nir_ssa_def *sample_low =828nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af),829nir_ishl(b, sample, nir_imm_int(b, 2))),830nir_imm_int(b, 0xf));831nir_ssa_def *sample_high =832nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c),833nir_ishl(b, nir_iadd(b, sample,834nir_imm_int(b, -8)),835nir_imm_int(b, 2))),836nir_imm_int(b, 0xf));837838sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)),839sample_low, sample_high);840}841nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),842nir_channel(b, sample_coords_int, 1),843sample);844tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type);845}846847nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);848nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);849return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),850nir_flrp(b, tex_data[2], tex_data[3], frac_x),851frac_y);852}853854/** Perform a color bit-cast operation855*856* For copy operations involving CCS, we may need to use different formats for857* the source and destination surfaces. The two formats must both be UINT858* formats and must have the same size but may have different bit layouts.859* For instance, we may be copying from R8G8B8A8_UINT to R32_UINT or R32_UINT860* to R16G16_UINT. 
This function generates code to shuffle bits around to get861* us from one to the other.862*/863static nir_ssa_def *864bit_cast_color(struct nir_builder *b, nir_ssa_def *color,865const struct brw_blorp_blit_prog_key *key)866{867if (key->src_format == key->dst_format)868return color;869870const struct isl_format_layout *src_fmtl =871isl_format_get_layout(key->src_format);872const struct isl_format_layout *dst_fmtl =873isl_format_get_layout(key->dst_format);874875/* They must be formats with the same bit size */876assert(src_fmtl->bpb == dst_fmtl->bpb);877878if (src_fmtl->bpb <= 32) {879assert(src_fmtl->channels.r.type == ISL_UINT ||880src_fmtl->channels.r.type == ISL_UNORM);881assert(dst_fmtl->channels.r.type == ISL_UINT ||882dst_fmtl->channels.r.type == ISL_UNORM);883884nir_ssa_def *packed = nir_imm_int(b, 0);885for (unsigned c = 0; c < 4; c++) {886if (src_fmtl->channels_array[c].bits == 0)887continue;888889const unsigned chan_start_bit = src_fmtl->channels_array[c].start_bit;890const unsigned chan_bits = src_fmtl->channels_array[c].bits;891892nir_ssa_def *chan = nir_channel(b, color, c);893if (src_fmtl->channels_array[c].type == ISL_UNORM)894chan = nir_format_float_to_unorm(b, chan, &chan_bits);895896packed = nir_ior(b, packed, nir_shift_imm(b, chan, chan_start_bit));897}898899nir_ssa_def *chans[4] = { };900for (unsigned c = 0; c < 4; c++) {901if (dst_fmtl->channels_array[c].bits == 0) {902chans[c] = nir_imm_int(b, 0);903continue;904}905906const unsigned chan_start_bit = dst_fmtl->channels_array[c].start_bit;907const unsigned chan_bits = dst_fmtl->channels_array[c].bits;908chans[c] = nir_iand(b, nir_shift_imm(b, packed, -(int)chan_start_bit),909nir_imm_int(b, BITFIELD_MASK(chan_bits)));910911if (dst_fmtl->channels_array[c].type == ISL_UNORM)912chans[c] = nir_format_unorm_to_float(b, chans[c], &chan_bits);913}914color = nir_vec(b, chans, 4);915} else {916/* This path only supports UINT formats */917assert(src_fmtl->channels.r.type == 
ISL_UINT);918assert(dst_fmtl->channels.r.type == ISL_UINT);919920const unsigned src_bpc = src_fmtl->channels.r.bits;921const unsigned dst_bpc = dst_fmtl->channels.r.bits;922923assert(src_fmtl->channels.g.bits == 0 ||924src_fmtl->channels.g.bits == src_fmtl->channels.r.bits);925assert(src_fmtl->channels.b.bits == 0 ||926src_fmtl->channels.b.bits == src_fmtl->channels.r.bits);927assert(src_fmtl->channels.a.bits == 0 ||928src_fmtl->channels.a.bits == src_fmtl->channels.r.bits);929assert(dst_fmtl->channels.g.bits == 0 ||930dst_fmtl->channels.g.bits == dst_fmtl->channels.r.bits);931assert(dst_fmtl->channels.b.bits == 0 ||932dst_fmtl->channels.b.bits == dst_fmtl->channels.r.bits);933assert(dst_fmtl->channels.a.bits == 0 ||934dst_fmtl->channels.a.bits == dst_fmtl->channels.r.bits);935936/* Restrict to only the channels we actually have */937const unsigned src_channels =938isl_format_get_num_channels(key->src_format);939color = nir_channels(b, color, (1 << src_channels) - 1);940941color = nir_format_bitcast_uvec_unmasked(b, color, src_bpc, dst_bpc);942}943944/* Blorp likes to assume that colors are vec4s */945nir_ssa_def *u = nir_ssa_undef(b, 1, 32);946nir_ssa_def *chans[4] = { u, u, u, u };947for (unsigned i = 0; i < color->num_components; i++)948chans[i] = nir_channel(b, color, i);949return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);950}951952static nir_ssa_def *953select_color_channel(struct nir_builder *b, nir_ssa_def *color,954nir_alu_type data_type,955enum isl_channel_select chan)956{957if (chan == ISL_CHANNEL_SELECT_ZERO) {958return nir_imm_int(b, 0);959} else if (chan == ISL_CHANNEL_SELECT_ONE) {960switch (data_type) {961case nir_type_int:962case nir_type_uint:963return nir_imm_int(b, 1);964case nir_type_float:965return nir_imm_float(b, 1);966default:967unreachable("Invalid data type");968}969} else {970assert((unsigned)(chan - ISL_CHANNEL_SELECT_RED) < 4);971return nir_channel(b, color, chan - ISL_CHANNEL_SELECT_RED);972}973}974975static nir_ssa_def 
*976swizzle_color(struct nir_builder *b, nir_ssa_def *color,977struct isl_swizzle swizzle, nir_alu_type data_type)978{979return nir_vec4(b,980select_color_channel(b, color, data_type, swizzle.r),981select_color_channel(b, color, data_type, swizzle.g),982select_color_channel(b, color, data_type, swizzle.b),983select_color_channel(b, color, data_type, swizzle.a));984}985986static nir_ssa_def *987convert_color(struct nir_builder *b, nir_ssa_def *color,988const struct brw_blorp_blit_prog_key *key)989{990/* All of our color conversions end up generating a single-channel color991* value that we need to write out.992*/993nir_ssa_def *value;994995if (key->dst_format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {996/* The destination image is bound as R32_UINT but the data needs to be997* in R24_UNORM_X8_TYPELESS. The bottom 24 are the actual data and the998* top 8 need to be zero. We can accomplish this by simply multiplying999* by a factor to scale things down.1000*/1001unsigned factor = (1 << 24) - 1;1002value = nir_fsat(b, nir_channel(b, color, 0));1003value = nir_f2i32(b, nir_fmul(b, value, nir_imm_float(b, factor)));1004} else if (key->dst_format == ISL_FORMAT_L8_UNORM_SRGB) {1005value = nir_format_linear_to_srgb(b, nir_channel(b, color, 0));1006} else if (key->dst_format == ISL_FORMAT_R8G8B8_UNORM_SRGB) {1007value = nir_format_linear_to_srgb(b, color);1008} else if (key->dst_format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {1009value = nir_format_pack_r9g9b9e5(b, color);1010} else {1011unreachable("Unsupported format conversion");1012}10131014nir_ssa_def *out_comps[4];1015for (unsigned i = 0; i < 4; i++) {1016if (i < value->num_components)1017out_comps[i] = nir_channel(b, value, i);1018else1019out_comps[i] = nir_ssa_undef(b, 1, 32);1020}1021return nir_vec(b, out_comps, 4);1022}10231024/**1025* Generator for WM programs used in BLORP blits.1026*1027* The bulk of the work done by the WM program is to wrap and unwrap the1028* coordinate transformations used by the hardware to store 
 * surfaces in
 * memory.  The hardware transforms a pixel location (X, Y, S) (where S is the
 * sample index for a multisampled surface) to a memory offset by the
 * following formulas:
 *
 *   offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
 *   (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
 *
 * For a single-sampled surface, or for a multisampled surface using
 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa() are the identity
 * function:
 *
 *   encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
 *   decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
 *   encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
 *   decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
 *
 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
 * embeds the sample number into bit 1 of the X and Y coordinates:
 *
 *   encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
 *     where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
 *           Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
 *   decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
 *     where X' = (X & ~0b11) >> 1 | (X & 0b1)
 *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 *           S = (Y & 0b10) | (X & 0b10) >> 1
 *
 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
 * the Y coordinate:
 *
 *   encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
 *     where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
 *           Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
 *   decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
 *     where X' = (X & ~0b111) >> 2 | (X & 0b1)
 *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 *           S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
 *
 * For X tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
 * bytes wide and 8 rows high:
 *
 *   tile(x_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
 *           offset = (Y' & 0b111) << 9
 *                  | (X' & 0b111111111)
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(x_tiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           Y' = (tile_num / tile_pitch) << 3
 *              | (A & 0b111000000000) >> 9
 *           X' = (tile_num % tile_pitch) << 9
 *              | (A & 0b111111111)
 *
 * (In all tiling formulas, cpp is the number of bytes occupied by a single
 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
 * to fill the width of the surface, and qpitch is the spacing (in rows)
 * between array slices).
 *
 * For Y tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
 * bytes wide and 32 rows high:
 *
 *   tile(y_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
 *           offset = (X' & 0b1110000) << 5
 *                  | (Y' & 0b11111) << 4
 *                  | (X' & 0b1111)
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(y_tiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           Y' = (tile_num / tile_pitch) << 5
 *              | (A & 0b111110000) >> 4
 *           X' = (tile_num % tile_pitch) << 7
 *              | (A & 0b111000000000) >> 5
 *              | (A & 0b1111)
 *
 * For W tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
 * bytes wide and 64 rows high (note that W tiling is only used for stencil
 * buffers, which always have cpp = 1 and S=0):
 *
 *   tile(w_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
 *           offset = (X' & 0b111000) << 6
 *                  | (Y' & 0b111100) << 3
 *                  | (X' & 0b100) << 2
 *                  | (Y' & 0b10) << 2
 *                  | (X' & 0b10) << 1
 *                  | (Y' & 0b1) << 1
 *                  | (X' & 0b1)
 *           X' = X *
cpp = X1131* Y' = Y + S * qpitch1132* detile(w_tiled, A) = (X, Y, S)1133* where X = X' / cpp = X'1134* Y = Y' % qpitch = Y'1135* S = Y / qpitch = 01136* Y' = (tile_num / tile_pitch) << 61137* | (A & 0b111100000) >> 31138* | (A & 0b1000) >> 21139* | (A & 0b10) >> 11140* X' = (tile_num % tile_pitch) << 61141* | (A & 0b111000000000) >> 61142* | (A & 0b10000) >> 21143* | (A & 0b100) >> 11144* | (A & 0b1)1145*1146* Finally, for a non-tiled surface, tile() simply combines together the X and1147* Y coordinates in the natural way:1148*1149* tile(untiled, X, Y, S) = A1150* where A = Y * pitch + X'1151* X' = X * cpp1152* Y' = Y + S * qpitch1153* detile(untiled, A) = (X, Y, S)1154* where X = X' / cpp1155* Y = Y' % qpitch1156* S = Y' / qpitch1157* X' = A % pitch1158* Y' = A / pitch1159*1160* (In these formulas, pitch is the number of bytes occupied by a single row1161* of samples).1162*/1163static nir_shader *1164brw_blorp_build_nir_shader(struct blorp_context *blorp, void *mem_ctx,1165const struct brw_blorp_blit_prog_key *key)1166{1167const struct intel_device_info *devinfo = blorp->isl_dev->info;1168nir_ssa_def *src_pos, *dst_pos, *color;11691170/* Sanity checks */1171if (key->dst_tiled_w && key->rt_samples > 1) {1172/* If the destination image is W tiled and multisampled, then the thread1173* must be dispatched once per sample, not once per pixel. 
This is1174* necessary because after conversion between W and Y tiling, there's no1175* guarantee that all samples corresponding to a single pixel will still1176* be together.1177*/1178assert(key->persample_msaa_dispatch);1179}11801181if (key->persample_msaa_dispatch) {1182/* It only makes sense to do persample dispatch if the render target is1183* configured as multisampled.1184*/1185assert(key->rt_samples > 0);1186}11871188/* Make sure layout is consistent with sample count */1189assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) ==1190(key->tex_samples <= 1));1191assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) ==1192(key->rt_samples <= 1));1193assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) ==1194(key->src_samples <= 1));1195assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) ==1196(key->dst_samples <= 1));11971198nir_builder b;1199blorp_nir_init_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);12001201struct brw_blorp_blit_vars v;1202brw_blorp_blit_vars_init(&b, &v, key);12031204dst_pos = blorp_blit_get_frag_coords(&b, key, &v);12051206/* Render target and texture hardware don't support W tiling until Gfx8. 
*/1207const bool rt_tiled_w = false;1208const bool tex_tiled_w = devinfo->ver >= 8 && key->src_tiled_w;12091210/* The address that data will be written to is determined by the1211* coordinates supplied to the WM thread and the tiling and sample count of1212* the render target, according to the formula:1213*1214* (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))1215*1216* If the actual tiling and sample count of the destination surface are not1217* the same as the configuration of the render target, then these1218* coordinates are wrong and we have to adjust them to compensate for the1219* difference.1220*/1221if (rt_tiled_w != key->dst_tiled_w ||1222key->rt_samples != key->dst_samples ||1223key->rt_layout != key->dst_layout) {1224dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,1225key->rt_layout);1226/* Now (X, Y, S) = detile(rt_tiling, offset) */1227if (rt_tiled_w != key->dst_tiled_w)1228dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);1229/* Now (X, Y, S) = detile(rt_tiling, offset) */1230dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,1231key->dst_layout);1232}12331234nir_ssa_def *comp = NULL;1235if (key->dst_rgb) {1236/* The destination image is bound as a red texture three times as wide1237* as the actual image. Our shader is effectively running one color1238* component at a time. 
We need to save off the component and adjust1239* the destination position.1240*/1241assert(dst_pos->num_components == 2);1242nir_ssa_def *dst_x = nir_channel(&b, dst_pos, 0);1243comp = nir_umod(&b, dst_x, nir_imm_int(&b, 3));1244dst_pos = nir_vec2(&b, nir_idiv(&b, dst_x, nir_imm_int(&b, 3)),1245nir_channel(&b, dst_pos, 1));1246}12471248/* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).1249*1250* That is: X, Y and S now contain the true coordinates and sample index of1251* the data that the WM thread should output.1252*1253* If we need to kill pixels that are outside the destination rectangle,1254* now is the time to do it.1255*/1256if (key->use_kill)1257blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);12581259src_pos = blorp_blit_apply_transform(&b, nir_i2f32(&b, dst_pos), &v);1260if (dst_pos->num_components == 3) {1261/* The sample coordinate is an integer that we want left alone but1262* blorp_blit_apply_transform() blindly applies the transform to all1263* three coordinates. Grab the original sample index.1264*/1265src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),1266nir_channel(&b, src_pos, 1),1267nir_channel(&b, dst_pos, 2));1268}12691270/* If the source image is not multisampled, then we want to fetch sample1271* number 0, because that's the only sample there is.1272*/1273if (key->src_samples == 1)1274src_pos = nir_channels(&b, src_pos, 0x3);12751276/* X, Y, and S are now the coordinates of the pixel in the source image1277* that we want to texture from. 
Exception: if we are blending, then S is1278* irrelevant, because we are going to fetch all samples.1279*/1280switch (key->filter) {1281case BLORP_FILTER_NONE:1282case BLORP_FILTER_NEAREST:1283case BLORP_FILTER_SAMPLE_0:1284/* We're going to use texelFetch, so we need integers */1285if (src_pos->num_components == 2) {1286src_pos = nir_f2i32(&b, src_pos);1287} else {1288assert(src_pos->num_components == 3);1289src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i32(&b, src_pos), 0),1290nir_channel(&b, nir_f2i32(&b, src_pos), 1),1291nir_channel(&b, src_pos, 2));1292}12931294/* We aren't blending, which means we just want to fetch a single1295* sample from the source surface. The address that we want to fetch1296* from is related to the X, Y and S values according to the formula:1297*1298* (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).1299*1300* If the actual tiling and sample count of the source surface are1301* not the same as the configuration of the texture, then we need to1302* adjust the coordinates to compensate for the difference.1303*/1304if (tex_tiled_w != key->src_tiled_w ||1305key->tex_samples != key->src_samples ||1306key->tex_layout != key->src_layout) {1307src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,1308key->src_layout);1309/* Now (X, Y, S) = detile(src_tiling, offset) */1310if (tex_tiled_w != key->src_tiled_w)1311src_pos = blorp_nir_retile_w_to_y(&b, src_pos);1312/* Now (X, Y, S) = detile(tex_tiling, offset) */1313src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,1314key->tex_layout);1315}13161317if (key->need_src_offset)1318src_pos = nir_iadd(&b, src_pos, nir_load_var(&b, v.v_src_offset));13191320/* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).1321*1322* In other words: X, Y, and S now contain values which, when passed to1323* the texturing unit, will cause data to be read from the correct1324* memory location. 
So we can fetch the texel now.1325*/1326if (key->src_samples == 1) {1327color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);1328} else {1329nir_ssa_def *mcs = NULL;1330if (isl_aux_usage_has_mcs(key->tex_aux_usage))1331mcs = blorp_blit_txf_ms_mcs(&b, &v, src_pos);13321333color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);1334}1335break;13361337case BLORP_FILTER_BILINEAR:1338assert(!key->src_tiled_w);1339assert(key->tex_samples == key->src_samples);1340assert(key->tex_layout == key->src_layout);13411342if (key->src_samples == 1) {1343color = blorp_nir_tex(&b, &v, key, src_pos);1344} else {1345assert(!key->use_kill);1346color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples,1347key, &v);1348}1349break;13501351case BLORP_FILTER_AVERAGE:1352case BLORP_FILTER_MIN_SAMPLE:1353case BLORP_FILTER_MAX_SAMPLE:1354assert(!key->src_tiled_w);1355assert(key->tex_samples == key->src_samples);1356assert(key->tex_layout == key->src_layout);13571358/* Resolves (effecively) use texelFetch, so we need integers and we1359* don't care about the sample index if we got one.1360*/1361src_pos = nir_f2i32(&b, nir_channels(&b, src_pos, 0x3));13621363if (devinfo->ver == 6) {1364/* Because gfx6 only supports 4x interleved MSAA, we can do all the1365* blending we need with a single linear-interpolated texture lookup1366* at the center of the sample. The texture coordinates to be odd1367* integers so that they correspond to the center of a 2x2 block1368* representing the four samples that maxe up a pixel. So we need1369* to multiply our X and Y coordinates each by 2 and then add 1.1370*/1371assert(key->src_coords_normalized);1372assert(key->filter == BLORP_FILTER_AVERAGE);1373src_pos = nir_fadd(&b,1374nir_i2f32(&b, src_pos),1375nir_imm_float(&b, 0.5f));1376color = blorp_nir_tex(&b, &v, key, src_pos);1377} else {1378/* Gfx7+ hardware doesn't automaticaly blend. 
*/1379color = blorp_nir_combine_samples(&b, &v, src_pos, key->src_samples,1380key->tex_aux_usage,1381key->texture_data_type,1382key->filter);1383}1384break;13851386default:1387unreachable("Invalid blorp filter");1388}13891390if (!isl_swizzle_is_identity(key->src_swizzle)) {1391color = swizzle_color(&b, color, key->src_swizzle,1392key->texture_data_type);1393}13941395if (!isl_swizzle_is_identity(key->dst_swizzle)) {1396color = swizzle_color(&b, color, isl_swizzle_invert(key->dst_swizzle),1397nir_type_int);1398}13991400if (key->format_bit_cast) {1401assert(isl_swizzle_is_identity(key->src_swizzle));1402assert(isl_swizzle_is_identity(key->dst_swizzle));1403color = bit_cast_color(&b, color, key);1404} else if (key->dst_format) {1405color = convert_color(&b, color, key);1406} else if (key->uint32_to_sint) {1407/* Normally the hardware will take care of converting values from/to1408* the source and destination formats. But a few cases need help.1409*1410* The Skylake PRM, volume 07, page 658 has a programming note:1411*1412* "When using SINT or UINT rendertarget surface formats, Blending1413* must be DISABLED. The Pre-Blend Color Clamp Enable and Color1414* Clamp Range fields are ignored, and an implied clamp to the1415* rendertarget surface format is performed."1416*1417* For UINT to SINT blits, our sample operation gives us a uint32_t,1418* but our render target write expects a signed int32_t number. If we1419* simply passed the value along, the hardware would interpret a value1420* with bit 31 set as a negative value, clamping it to the largest1421* negative number the destination format could represent. But the1422* actual source value is a positive number, so we want to clamp it1423* to INT_MAX. To fix this, we explicitly take min(color, INT_MAX).1424*/1425color = nir_umin(&b, color, nir_imm_int(&b, INT32_MAX));1426} else if (key->sint32_to_uint) {1427/* Similar to above, but clamping negative numbers to zero. 
*/1428color = nir_imax(&b, color, nir_imm_int(&b, 0));1429}14301431if (key->dst_rgb) {1432/* The destination image is bound as a red texture three times as wide1433* as the actual image. Our shader is effectively running one color1434* component at a time. We need to pick off the appropriate component1435* from the source color and write that to destination red.1436*/1437assert(dst_pos->num_components == 2);14381439nir_ssa_def *color_component =1440nir_bcsel(&b, nir_ieq_imm(&b, comp, 0),1441nir_channel(&b, color, 0),1442nir_bcsel(&b, nir_ieq_imm(&b, comp, 1),1443nir_channel(&b, color, 1),1444nir_channel(&b, color, 2)));14451446nir_ssa_def *u = nir_ssa_undef(&b, 1, 32);1447color = nir_vec4(&b, color_component, u, u, u);1448}14491450if (key->dst_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT) {1451nir_variable *color_out =1452nir_variable_create(b.shader, nir_var_shader_out,1453glsl_vec4_type(), "gl_FragColor");1454color_out->data.location = FRAG_RESULT_COLOR;1455nir_store_var(&b, color_out, color, 0xf);1456} else if (key->dst_usage == ISL_SURF_USAGE_DEPTH_BIT) {1457nir_variable *depth_out =1458nir_variable_create(b.shader, nir_var_shader_out,1459glsl_float_type(), "gl_FragDepth");1460depth_out->data.location = FRAG_RESULT_DEPTH;1461nir_store_var(&b, depth_out, nir_channel(&b, color, 0), 0x1);1462} else if (key->dst_usage == ISL_SURF_USAGE_STENCIL_BIT) {1463nir_variable *stencil_out =1464nir_variable_create(b.shader, nir_var_shader_out,1465glsl_int_type(), "gl_FragStencilRef");1466stencil_out->data.location = FRAG_RESULT_STENCIL;1467nir_store_var(&b, stencil_out, nir_channel(&b, color, 0), 0x1);1468} else {1469unreachable("Invalid destination usage");1470}14711472return b.shader;1473}14741475static bool1476brw_blorp_get_blit_kernel(struct blorp_batch *batch,1477struct blorp_params *params,1478const struct brw_blorp_blit_prog_key *prog_key)1479{1480struct blorp_context *blorp = batch->blorp;14811482if (blorp->lookup_shader(batch, prog_key, 
sizeof(*prog_key),1483¶ms->wm_prog_kernel, ¶ms->wm_prog_data))1484return true;14851486void *mem_ctx = ralloc_context(NULL);14871488const unsigned *program;1489struct brw_wm_prog_data prog_data;14901491nir_shader *nir = brw_blorp_build_nir_shader(blorp, mem_ctx, prog_key);1492nir->info.name =1493ralloc_strdup(nir, blorp_shader_type_to_name(prog_key->base.shader_type));14941495struct brw_wm_prog_key wm_key;1496brw_blorp_init_wm_prog_key(&wm_key);1497wm_key.base.tex.compressed_multisample_layout_mask =1498isl_aux_usage_has_mcs(prog_key->tex_aux_usage);1499wm_key.base.tex.msaa_16 = prog_key->tex_samples == 16;1500wm_key.multisample_fbo = prog_key->rt_samples > 1;15011502program = blorp_compile_fs(blorp, mem_ctx, nir, &wm_key, false,1503&prog_data);15041505bool result =1506blorp->upload_shader(batch, MESA_SHADER_FRAGMENT,1507prog_key, sizeof(*prog_key),1508program, prog_data.base.program_size,1509&prog_data.base, sizeof(prog_data),1510¶ms->wm_prog_kernel, ¶ms->wm_prog_data);15111512ralloc_free(mem_ctx);1513return result;1514}15151516static void1517brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,1518GLfloat src0, GLfloat src1,1519GLfloat dst0, GLfloat dst1,1520bool mirror)1521{1522double scale = (double)(src1 - src0) / (double)(dst1 - dst0);1523if (!mirror) {1524/* When not mirroring a coordinate (say, X), we need:1525* src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale1526* Therefore:1527* src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale1528*1529* blorp program uses "round toward zero" to convert the1530* transformed floating point coordinates to integer coordinates,1531* whereas the behaviour we actually want is "round to nearest",1532* so 0.5 provides the necessary correction.1533*/1534xform->multiplier = scale;1535xform->offset = src0 + (-(double)dst0 + 0.5) * scale;1536} else {1537/* When mirroring X we need:1538* src_x - src_x0 = dst_x1 - dst_x - 0.51539* Therefore:1540* src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale1541*/1542xform->multiplier = 
-scale;1543xform->offset = src0 + ((double)dst1 - 0.5) * scale;1544}1545}15461547static inline void1548surf_get_intratile_offset_px(struct brw_blorp_surface_info *info,1549uint32_t *tile_x_px, uint32_t *tile_y_px)1550{1551if (info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {1552struct isl_extent2d px_size_sa =1553isl_get_interleaved_msaa_px_size_sa(info->surf.samples);1554assert(info->tile_x_sa % px_size_sa.width == 0);1555assert(info->tile_y_sa % px_size_sa.height == 0);1556*tile_x_px = info->tile_x_sa / px_size_sa.width;1557*tile_y_px = info->tile_y_sa / px_size_sa.height;1558} else {1559*tile_x_px = info->tile_x_sa;1560*tile_y_px = info->tile_y_sa;1561}1562}15631564void1565blorp_surf_convert_to_single_slice(const struct isl_device *isl_dev,1566struct brw_blorp_surface_info *info)1567{1568bool ok UNUSED;15691570/* It would be insane to try and do this on a compressed surface */1571assert(info->aux_usage == ISL_AUX_USAGE_NONE);15721573/* Just bail if we have nothing to do. */1574if (info->surf.dim == ISL_SURF_DIM_2D &&1575info->view.base_level == 0 && info->view.base_array_layer == 0 &&1576info->surf.levels == 1 && info->surf.logical_level0_px.array_len == 1)1577return;15781579/* If this gets triggered then we've gotten here twice which. 
This1580* shouldn't happen thanks to the above early return.1581*/1582assert(info->tile_x_sa == 0 && info->tile_y_sa == 0);15831584uint32_t layer = 0, z = 0;1585if (info->surf.dim == ISL_SURF_DIM_3D)1586z = info->view.base_array_layer + info->z_offset;1587else1588layer = info->view.base_array_layer;15891590uint32_t byte_offset;1591isl_surf_get_image_surf(isl_dev, &info->surf,1592info->view.base_level, layer, z,1593&info->surf,1594&byte_offset, &info->tile_x_sa, &info->tile_y_sa);1595info->addr.offset += byte_offset;15961597uint32_t tile_x_px, tile_y_px;1598surf_get_intratile_offset_px(info, &tile_x_px, &tile_y_px);15991600/* Instead of using the X/Y Offset fields in RENDER_SURFACE_STATE, we place1601* the image at the tile boundary and offset our sampling or rendering.1602* For this reason, we need to grow the image by the offset to ensure that1603* the hardware doesn't think we've gone past the edge.1604*/1605info->surf.logical_level0_px.w += tile_x_px;1606info->surf.logical_level0_px.h += tile_y_px;1607info->surf.phys_level0_sa.w += info->tile_x_sa;1608info->surf.phys_level0_sa.h += info->tile_y_sa;16091610/* The view is also different now. 
*/1611info->view.base_level = 0;1612info->view.levels = 1;1613info->view.base_array_layer = 0;1614info->view.array_len = 1;1615info->z_offset = 0;1616}16171618void1619blorp_surf_fake_interleaved_msaa(const struct isl_device *isl_dev,1620struct brw_blorp_surface_info *info)1621{1622assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED);16231624/* First, we need to convert it to a simple 1-level 1-layer 2-D surface */1625blorp_surf_convert_to_single_slice(isl_dev, info);16261627info->surf.logical_level0_px = info->surf.phys_level0_sa;1628info->surf.samples = 1;1629info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;1630}16311632void1633blorp_surf_retile_w_to_y(const struct isl_device *isl_dev,1634struct brw_blorp_surface_info *info)1635{1636assert(info->surf.tiling == ISL_TILING_W);16371638/* First, we need to convert it to a simple 1-level 1-layer 2-D surface */1639blorp_surf_convert_to_single_slice(isl_dev, info);16401641/* On gfx7+, we don't have interleaved multisampling for color render1642* targets so we have to fake it.1643*1644* TODO: Are we sure we don't also need to fake it on gfx6?1645*/1646if (isl_dev->info->ver > 6 &&1647info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {1648blorp_surf_fake_interleaved_msaa(isl_dev, info);1649}16501651if (isl_dev->info->ver == 6) {1652/* Gfx6 stencil buffers have a very large alignment coming in from the1653* miptree. It's out-of-bounds for what the surface state can handle.1654* Since we have a single layer and level, it doesn't really matter as1655* long as we don't pass a bogus value into isl_surf_fill_state().1656*/1657info->surf.image_alignment_el = isl_extent3d(4, 2, 1);1658}16591660/* Now that we've converted everything to a simple 2-D surface with only1661* one miplevel, we can go about retiling it.1662*/1663const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 
8 : 4;1664info->surf.tiling = ISL_TILING_Y0;1665info->surf.logical_level0_px.width =1666ALIGN(info->surf.logical_level0_px.width, x_align) * 2;1667info->surf.logical_level0_px.height =1668ALIGN(info->surf.logical_level0_px.height, y_align) / 2;1669info->tile_x_sa *= 2;1670info->tile_y_sa /= 2;1671}16721673static bool1674can_shrink_surface(const struct brw_blorp_surface_info *surf)1675{1676/* The current code doesn't support offsets into the aux buffers. This1677* should be possible, but we need to make sure the offset is page1678* aligned for both the surface and the aux buffer surface. Generally1679* this mean using the page aligned offset for the aux buffer.1680*1681* Currently the cases where we must split the blit are limited to cases1682* where we don't have a aux buffer.1683*/1684if (surf->aux_addr.buffer != NULL)1685return false;16861687/* We can't support splitting the blit for gen <= 7, because the qpitch1688* size is calculated by the hardware based on the surface height for1689* gen <= 7. In gen >= 8, the qpitch is controlled by the driver.1690*/1691if (surf->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY)1692return false;16931694return true;1695}16961697static unsigned1698get_max_surface_size(const struct intel_device_info *devinfo,1699const struct brw_blorp_surface_info *surf)1700{1701const unsigned max = devinfo->ver >= 7 ? 
16384 : 8192;1702if (split_blorp_blit_debug && can_shrink_surface(surf))1703return max >> 4; /* A smaller restriction when debug is enabled */1704else1705return max;1706}17071708struct blt_axis {1709double src0, src1, dst0, dst1;1710bool mirror;1711};17121713struct blt_coords {1714struct blt_axis x, y;1715};17161717static enum isl_format1718get_red_format_for_rgb_format(enum isl_format format)1719{1720const struct isl_format_layout *fmtl = isl_format_get_layout(format);17211722switch (fmtl->channels.r.bits) {1723case 8:1724switch (fmtl->channels.r.type) {1725case ISL_UNORM:1726return ISL_FORMAT_R8_UNORM;1727case ISL_SNORM:1728return ISL_FORMAT_R8_SNORM;1729case ISL_UINT:1730return ISL_FORMAT_R8_UINT;1731case ISL_SINT:1732return ISL_FORMAT_R8_SINT;1733default:1734unreachable("Invalid 8-bit RGB channel type");1735}1736case 16:1737switch (fmtl->channels.r.type) {1738case ISL_UNORM:1739return ISL_FORMAT_R16_UNORM;1740case ISL_SNORM:1741return ISL_FORMAT_R16_SNORM;1742case ISL_SFLOAT:1743return ISL_FORMAT_R16_FLOAT;1744case ISL_UINT:1745return ISL_FORMAT_R16_UINT;1746case ISL_SINT:1747return ISL_FORMAT_R16_SINT;1748default:1749unreachable("Invalid 8-bit RGB channel type");1750}1751case 32:1752switch (fmtl->channels.r.type) {1753case ISL_SFLOAT:1754return ISL_FORMAT_R32_FLOAT;1755case ISL_UINT:1756return ISL_FORMAT_R32_UINT;1757case ISL_SINT:1758return ISL_FORMAT_R32_SINT;1759default:1760unreachable("Invalid 8-bit RGB channel type");1761}1762default:1763unreachable("Invalid number of red channel bits");1764}1765}17661767void1768surf_fake_rgb_with_red(const struct isl_device *isl_dev,1769struct brw_blorp_surface_info *info)1770{1771blorp_surf_convert_to_single_slice(isl_dev, info);17721773info->surf.logical_level0_px.width *= 3;1774info->surf.phys_level0_sa.width *= 3;1775info->tile_x_sa *= 3;17761777enum isl_format red_format =1778get_red_format_for_rgb_format(info->view.format);17791780assert(isl_format_get_layout(red_format)->channels.r.type 
==1781isl_format_get_layout(info->view.format)->channels.r.type);1782assert(isl_format_get_layout(red_format)->channels.r.bits ==1783isl_format_get_layout(info->view.format)->channels.r.bits);17841785info->surf.format = info->view.format = red_format;1786}17871788enum blit_shrink_status {1789BLIT_NO_SHRINK = 0,1790BLIT_SRC_WIDTH_SHRINK = (1 << 0),1791BLIT_DST_WIDTH_SHRINK = (1 << 1),1792BLIT_SRC_HEIGHT_SHRINK = (1 << 2),1793BLIT_DST_HEIGHT_SHRINK = (1 << 3),1794};17951796/* Try to blit. If the surface parameters exceed the size allowed by hardware,1797* then enum blit_shrink_status will be returned. If BLIT_NO_SHRINK is1798* returned, then the blit was successful.1799*/1800static enum blit_shrink_status1801try_blorp_blit(struct blorp_batch *batch,1802struct blorp_params *params,1803struct brw_blorp_blit_prog_key *wm_prog_key,1804struct blt_coords *coords)1805{1806const struct intel_device_info *devinfo = batch->blorp->isl_dev->info;18071808if (params->dst.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) {1809if (devinfo->ver >= 7) {1810/* We can render as depth on Gfx5 but there's no real advantage since1811* it doesn't support MSAA or HiZ. On Gfx4, we can't always render1812* to depth due to issues with depth buffers and mip-mapping. On1813* Gfx6, we can do everything but we have weird offsetting for HiZ1814* and stencil. 
It's easier to just render using the color pipe1815* on those platforms.1816*/1817wm_prog_key->dst_usage = ISL_SURF_USAGE_DEPTH_BIT;1818} else {1819wm_prog_key->dst_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;1820}1821} else if (params->dst.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) {1822assert(params->dst.surf.format == ISL_FORMAT_R8_UINT);1823if (devinfo->ver >= 9) {1824wm_prog_key->dst_usage = ISL_SURF_USAGE_STENCIL_BIT;1825} else {1826wm_prog_key->dst_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;1827}1828} else {1829wm_prog_key->dst_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;1830}18311832if (isl_format_has_sint_channel(params->src.view.format)) {1833wm_prog_key->texture_data_type = nir_type_int;1834} else if (isl_format_has_uint_channel(params->src.view.format)) {1835wm_prog_key->texture_data_type = nir_type_uint;1836} else {1837wm_prog_key->texture_data_type = nir_type_float;1838}18391840/* src_samples and dst_samples are the true sample counts */1841wm_prog_key->src_samples = params->src.surf.samples;1842wm_prog_key->dst_samples = params->dst.surf.samples;18431844wm_prog_key->tex_aux_usage = params->src.aux_usage;18451846/* src_layout and dst_layout indicate the true MSAA layout used by src and1847* dst.1848*/1849wm_prog_key->src_layout = params->src.surf.msaa_layout;1850wm_prog_key->dst_layout = params->dst.surf.msaa_layout;18511852/* Round floating point values to nearest integer to avoid "off by one texel"1853* kind of errors when blitting.1854*/1855params->x0 = params->wm_inputs.discard_rect.x0 = round(coords->x.dst0);1856params->y0 = params->wm_inputs.discard_rect.y0 = round(coords->y.dst0);1857params->x1 = params->wm_inputs.discard_rect.x1 = round(coords->x.dst1);1858params->y1 = params->wm_inputs.discard_rect.y1 = round(coords->y.dst1);18591860brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[0],1861coords->x.src0, coords->x.src1,1862coords->x.dst0, 
coords->x.dst1,1863coords->x.mirror);1864brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[1],1865coords->y.src0, coords->y.src1,1866coords->y.dst0, coords->y.dst1,1867coords->y.mirror);186818691870if (devinfo->ver == 4) {1871/* The MinLOD and MinimumArrayElement don't work properly for cube maps.1872* Convert them to a single slice on gfx4.1873*/1874if (params->dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT) {1875blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms->dst);1876wm_prog_key->need_dst_offset = true;1877}18781879if (params->src.surf.usage & ISL_SURF_USAGE_CUBE_BIT) {1880blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms->src);1881wm_prog_key->need_src_offset = true;1882}1883}18841885if (devinfo->ver > 6 &&1886!isl_surf_usage_is_depth_or_stencil(wm_prog_key->dst_usage) &&1887params->dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {1888assert(params->dst.surf.samples > 1);18891890/* We must expand the rectangle we send through the rendering pipeline,1891* to account for the fact that we are mapping the destination region as1892* single-sampled when it is in fact multisampled. 
We must also align1893* it to a multiple of the multisampling pattern, because the1894* differences between multisampled and single-sampled surface formats1895* will mean that pixels are scrambled within the multisampling pattern.1896* TODO: what if this makes the coordinates too large?1897*1898* Note: this only works if the destination surface uses the IMS layout.1899* If it's UMS, then we have no choice but to set up the rendering1900* pipeline as multisampled.1901*/1902struct isl_extent2d px_size_sa =1903isl_get_interleaved_msaa_px_size_sa(params->dst.surf.samples);1904params->x0 = ROUND_DOWN_TO(params->x0, 2) * px_size_sa.width;1905params->y0 = ROUND_DOWN_TO(params->y0, 2) * px_size_sa.height;1906params->x1 = ALIGN(params->x1, 2) * px_size_sa.width;1907params->y1 = ALIGN(params->y1, 2) * px_size_sa.height;19081909blorp_surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst);19101911wm_prog_key->use_kill = true;1912wm_prog_key->need_dst_offset = true;1913}19141915if (params->dst.surf.tiling == ISL_TILING_W &&1916wm_prog_key->dst_usage != ISL_SURF_USAGE_STENCIL_BIT) {1917/* We must modify the rectangle we send through the rendering pipeline1918* (and the size and x/y offset of the destination surface), to account1919* for the fact that we are mapping it as Y-tiled when it is in fact1920* W-tiled.1921*1922* Both Y tiling and W tiling can be understood as organizations of1923* 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels1924* is different, but the layout of the 32-byte sub-tiles within the 4k1925* tile is the same (8 sub-tiles across by 16 sub-tiles down, in1926* column-major order). 
In Y tiling, the sub-tiles are 16 bytes wide1927* and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.1928*1929* Therefore, to account for the layout differences within the 32-byte1930* sub-tiles, we must expand the rectangle so the X coordinates of its1931* edges are multiples of 8 (the W sub-tile width), and its Y1932* coordinates of its edges are multiples of 4 (the W sub-tile height).1933* Then we need to scale the X and Y coordinates of the rectangle to1934* account for the differences in aspect ratio between the Y and W1935* sub-tiles. We need to modify the layer width and height similarly.1936*1937* A correction needs to be applied when MSAA is in use: since1938* INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,1939* we need to align the Y coordinates to multiples of 8, so that when1940* they are divided by two they are still multiples of 4.1941*1942* Note: Since the x/y offset of the surface will be applied using the1943* SURFACE_STATE command packet, it will be invisible to the swizzling1944* code in the shader; therefore it needs to be in a multiple of the1945* 32-byte sub-tile size. 
Fortunately it is, since the sub-tile is 81946* pixels wide and 4 pixels high (when viewed as a W-tiled stencil1947* buffer), and the miplevel alignment used for stencil buffers is 81948* pixels horizontally and either 4 or 8 pixels vertically (see1949* intel_horizontal_texture_alignment_unit() and1950* intel_vertical_texture_alignment_unit()).1951*1952* Note: Also, since the SURFACE_STATE command packet can only apply1953* offsets that are multiples of 4 pixels horizontally and 2 pixels1954* vertically, it is important that the offsets will be multiples of1955* these sizes after they are converted into Y-tiled coordinates.1956* Fortunately they will be, since we know from above that the offsets1957* are a multiple of the 32-byte sub-tile size, and in Y-tiled1958* coordinates the sub-tile is 16 pixels wide and 2 pixels high.1959*1960* TODO: what if this makes the coordinates (or the texture size) too1961* large?1962*/1963const unsigned x_align = 8;1964const unsigned y_align = params->dst.surf.samples != 0 ? 
8 : 4;1965params->x0 = ROUND_DOWN_TO(params->x0, x_align) * 2;1966params->y0 = ROUND_DOWN_TO(params->y0, y_align) / 2;1967params->x1 = ALIGN(params->x1, x_align) * 2;1968params->y1 = ALIGN(params->y1, y_align) / 2;19691970/* Retile the surface to Y-tiled */1971blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst);19721973wm_prog_key->dst_tiled_w = true;1974wm_prog_key->use_kill = true;1975wm_prog_key->need_dst_offset = true;19761977if (params->dst.surf.samples > 1) {1978/* If the destination surface is a W-tiled multisampled stencil1979* buffer that we're mapping as Y tiled, then we need to arrange for1980* the WM program to run once per sample rather than once per pixel,1981* because the memory layout of related samples doesn't match between1982* W and Y tiling.1983*/1984wm_prog_key->persample_msaa_dispatch = true;1985}1986}19871988if (devinfo->ver < 8 && params->src.surf.tiling == ISL_TILING_W) {1989/* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled.1990* Broadwell adds support for sampling from stencil.1991*1992* See the comments above concerning x/y offset alignment for the1993* destination surface.1994*1995* TODO: what if this makes the texture size too large?1996*/1997blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->src);19981999wm_prog_key->src_tiled_w = true;2000wm_prog_key->need_src_offset = true;2001}20022003/* tex_samples and rt_samples are the sample counts that are set up in2004* SURFACE_STATE.2005*/2006wm_prog_key->tex_samples = params->src.surf.samples;2007wm_prog_key->rt_samples = params->dst.surf.samples;20082009/* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will2010* use to access the source and destination surfaces.2011*/2012wm_prog_key->tex_layout = params->src.surf.msaa_layout;2013wm_prog_key->rt_layout = params->dst.surf.msaa_layout;20142015if (params->src.surf.samples > 0 && params->dst.surf.samples > 1) {2016/* We are blitting from a multisample buffer to a multisample buffer, so2017* we must 
preserve samples within a pixel. This means we have to2018* arrange for the WM program to run once per sample rather than once2019* per pixel.2020*/2021wm_prog_key->persample_msaa_dispatch = true;2022}20232024params->num_samples = params->dst.surf.samples;20252026if ((wm_prog_key->filter == BLORP_FILTER_AVERAGE ||2027wm_prog_key->filter == BLORP_FILTER_BILINEAR) &&2028batch->blorp->isl_dev->info->ver <= 6) {2029/* Gfx4-5 don't support non-normalized texture coordinates */2030wm_prog_key->src_coords_normalized = true;2031params->wm_inputs.src_inv_size[0] =20321.0f / minify(params->src.surf.logical_level0_px.width,2033params->src.view.base_level);2034params->wm_inputs.src_inv_size[1] =20351.0f / minify(params->src.surf.logical_level0_px.height,2036params->src.view.base_level);2037}20382039if (isl_format_get_layout(params->dst.view.format)->bpb % 3 == 0) {2040/* We can't render to RGB formats natively because they aren't a2041* power-of-two size. Instead, we fake them by using a red format2042* with the same channel type and size and emitting shader code to2043* only write one channel at a time.2044*/2045params->x0 *= 3;2046params->x1 *= 3;20472048/* If it happens to be sRGB, we need to force a conversion */2049if (params->dst.view.format == ISL_FORMAT_R8G8B8_UNORM_SRGB)2050wm_prog_key->dst_format = ISL_FORMAT_R8G8B8_UNORM_SRGB;20512052surf_fake_rgb_with_red(batch->blorp->isl_dev, ¶ms->dst);20532054wm_prog_key->dst_rgb = true;2055wm_prog_key->need_dst_offset = true;2056} else if (isl_format_is_rgbx(params->dst.view.format)) {2057/* We can handle RGBX formats easily enough by treating them as RGBA */2058params->dst.view.format =2059isl_format_rgbx_to_rgba(params->dst.view.format);2060} else if (params->dst.view.format == ISL_FORMAT_R24_UNORM_X8_TYPELESS &&2061wm_prog_key->dst_usage != ISL_SURF_USAGE_DEPTH_BIT) {2062wm_prog_key->dst_format = params->dst.view.format;2063params->dst.view.format = ISL_FORMAT_R32_UINT;2064} else if (params->dst.view.format == 
ISL_FORMAT_A4B4G4R4_UNORM) {2065params->dst.view.swizzle =2066isl_swizzle_compose(params->dst.view.swizzle,2067ISL_SWIZZLE(ALPHA, RED, GREEN, BLUE));2068params->dst.view.format = ISL_FORMAT_B4G4R4A4_UNORM;2069} else if (params->dst.view.format == ISL_FORMAT_L8_UNORM_SRGB) {2070wm_prog_key->dst_format = params->dst.view.format;2071params->dst.view.format = ISL_FORMAT_R8_UNORM;2072} else if (params->dst.view.format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {2073wm_prog_key->dst_format = params->dst.view.format;2074params->dst.view.format = ISL_FORMAT_R32_UINT;2075}20762077if (devinfo->verx10 <= 70 &&2078!isl_swizzle_is_identity(params->src.view.swizzle)) {2079wm_prog_key->src_swizzle = params->src.view.swizzle;2080params->src.view.swizzle = ISL_SWIZZLE_IDENTITY;2081} else {2082wm_prog_key->src_swizzle = ISL_SWIZZLE_IDENTITY;2083}20842085if (!isl_swizzle_supports_rendering(devinfo, params->dst.view.swizzle)) {2086wm_prog_key->dst_swizzle = params->dst.view.swizzle;2087params->dst.view.swizzle = ISL_SWIZZLE_IDENTITY;2088} else {2089wm_prog_key->dst_swizzle = ISL_SWIZZLE_IDENTITY;2090}20912092if (params->src.tile_x_sa || params->src.tile_y_sa) {2093assert(wm_prog_key->need_src_offset);2094surf_get_intratile_offset_px(¶ms->src,2095¶ms->wm_inputs.src_offset.x,2096¶ms->wm_inputs.src_offset.y);2097}20982099if (params->dst.tile_x_sa || params->dst.tile_y_sa) {2100assert(wm_prog_key->need_dst_offset);2101surf_get_intratile_offset_px(¶ms->dst,2102¶ms->wm_inputs.dst_offset.x,2103¶ms->wm_inputs.dst_offset.y);2104params->x0 += params->wm_inputs.dst_offset.x;2105params->y0 += params->wm_inputs.dst_offset.y;2106params->x1 += params->wm_inputs.dst_offset.x;2107params->y1 += params->wm_inputs.dst_offset.y;2108}21092110/* For some texture types, we need to pass the layer through the sampler. 
*/2111params->wm_inputs.src_z = params->src.z_offset;21122113if (!brw_blorp_get_blit_kernel(batch, params, wm_prog_key))2114return 0;21152116if (!blorp_ensure_sf_program(batch, params))2117return 0;21182119unsigned result = 0;2120unsigned max_src_surface_size = get_max_surface_size(devinfo, ¶ms->src);2121if (params->src.surf.logical_level0_px.width > max_src_surface_size)2122result |= BLIT_SRC_WIDTH_SHRINK;2123if (params->src.surf.logical_level0_px.height > max_src_surface_size)2124result |= BLIT_SRC_HEIGHT_SHRINK;21252126unsigned max_dst_surface_size = get_max_surface_size(devinfo, ¶ms->dst);2127if (params->dst.surf.logical_level0_px.width > max_dst_surface_size)2128result |= BLIT_DST_WIDTH_SHRINK;2129if (params->dst.surf.logical_level0_px.height > max_dst_surface_size)2130result |= BLIT_DST_HEIGHT_SHRINK;21312132if (result == 0) {2133if (wm_prog_key->dst_usage == ISL_SURF_USAGE_DEPTH_BIT) {2134params->depth = params->dst;2135memset(¶ms->dst, 0, sizeof(params->dst));2136} else if (wm_prog_key->dst_usage == ISL_SURF_USAGE_STENCIL_BIT) {2137params->stencil = params->dst;2138params->stencil_mask = 0xff;2139memset(¶ms->dst, 0, sizeof(params->dst));2140}21412142batch->blorp->exec(batch, params);2143}21442145return result;2146}21472148/* Adjust split blit source coordinates for the current destination2149* coordinates.2150*/2151static void2152adjust_split_source_coords(const struct blt_axis *orig,2153struct blt_axis *split_coords,2154double scale)2155{2156/* When scale is greater than 0, then we are growing from the start, so2157* src0 uses delta0, and src1 uses delta1. When scale is less than 0, the2158* source range shrinks from the end. In that case src0 is adjusted by2159* delta1, and src1 is adjusted by delta0.2160*/2161double delta0 = scale * (split_coords->dst0 - orig->dst0);2162double delta1 = scale * (split_coords->dst1 - orig->dst1);2163split_coords->src0 = orig->src0 + (scale >= 0.0 ? delta0 : delta1);2164split_coords->src1 = orig->src1 + (scale >= 0.0 ? 
delta1 : delta0);2165}21662167static struct isl_extent2d2168get_px_size_sa(const struct isl_surf *surf)2169{2170static const struct isl_extent2d one_to_one = { .w = 1, .h = 1 };21712172if (surf->msaa_layout != ISL_MSAA_LAYOUT_INTERLEAVED)2173return one_to_one;2174else2175return isl_get_interleaved_msaa_px_size_sa(surf->samples);2176}21772178static void2179shrink_surface_params(const struct isl_device *dev,2180struct brw_blorp_surface_info *info,2181double *x0, double *x1, double *y0, double *y1)2182{2183uint32_t byte_offset, x_offset_sa, y_offset_sa, size;2184struct isl_extent2d px_size_sa;2185int adjust;21862187blorp_surf_convert_to_single_slice(dev, info);21882189px_size_sa = get_px_size_sa(&info->surf);21902191/* Because this gets called after we lower compressed images, the tile2192* offsets may be non-zero and we need to incorporate them in our2193* calculations.2194*/2195x_offset_sa = (uint32_t)*x0 * px_size_sa.w + info->tile_x_sa;2196y_offset_sa = (uint32_t)*y0 * px_size_sa.h + info->tile_y_sa;2197uint32_t tile_z_sa, tile_a;2198isl_tiling_get_intratile_offset_sa(info->surf.tiling,2199info->surf.format, info->surf.row_pitch_B,2200info->surf.array_pitch_el_rows,2201x_offset_sa, y_offset_sa, 0, 0,2202&byte_offset,2203&info->tile_x_sa, &info->tile_y_sa,2204&tile_z_sa, &tile_a);2205assert(tile_z_sa == 0 && tile_a == 0);22062207info->addr.offset += byte_offset;22082209adjust = (int)info->tile_x_sa / px_size_sa.w - (int)*x0;2210*x0 += adjust;2211*x1 += adjust;2212info->tile_x_sa = 0;22132214adjust = (int)info->tile_y_sa / px_size_sa.h - (int)*y0;2215*y0 += adjust;2216*y1 += adjust;2217info->tile_y_sa = 0;22182219size = MIN2((uint32_t)ceil(*x1), info->surf.logical_level0_px.width);2220info->surf.logical_level0_px.width = size;2221info->surf.phys_level0_sa.width = size * px_size_sa.w;22222223size = MIN2((uint32_t)ceil(*y1), info->surf.logical_level0_px.height);2224info->surf.logical_level0_px.height = size;2225info->surf.phys_level0_sa.height = size * 
px_size_sa.h;2226}22272228static void2229do_blorp_blit(struct blorp_batch *batch,2230const struct blorp_params *orig_params,2231struct brw_blorp_blit_prog_key *wm_prog_key,2232const struct blt_coords *orig)2233{2234struct blorp_params params;2235struct blt_coords blit_coords;2236struct blt_coords split_coords = *orig;2237double w = orig->x.dst1 - orig->x.dst0;2238double h = orig->y.dst1 - orig->y.dst0;2239double x_scale = (orig->x.src1 - orig->x.src0) / w;2240double y_scale = (orig->y.src1 - orig->y.src0) / h;2241if (orig->x.mirror)2242x_scale = -x_scale;2243if (orig->y.mirror)2244y_scale = -y_scale;22452246enum blit_shrink_status shrink = BLIT_NO_SHRINK;2247if (split_blorp_blit_debug) {2248if (can_shrink_surface(&orig_params->src))2249shrink |= BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK;2250if (can_shrink_surface(&orig_params->dst))2251shrink |= BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK;2252}22532254bool x_done, y_done;2255do {2256params = *orig_params;2257blit_coords = split_coords;22582259if (shrink & (BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK)) {2260shrink_surface_params(batch->blorp->isl_dev, ¶ms.src,2261&blit_coords.x.src0, &blit_coords.x.src1,2262&blit_coords.y.src0, &blit_coords.y.src1);2263wm_prog_key->need_src_offset = false;2264}22652266if (shrink & (BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK)) {2267shrink_surface_params(batch->blorp->isl_dev, ¶ms.dst,2268&blit_coords.x.dst0, &blit_coords.x.dst1,2269&blit_coords.y.dst0, &blit_coords.y.dst1);2270wm_prog_key->need_dst_offset = false;2271}22722273enum blit_shrink_status result =2274try_blorp_blit(batch, ¶ms, wm_prog_key, &blit_coords);22752276if (result & (BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK))2277assert(can_shrink_surface(&orig_params->src));22782279if (result & (BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK))2280assert(can_shrink_surface(&orig_params->dst));22812282if (result & (BLIT_SRC_WIDTH_SHRINK | BLIT_DST_WIDTH_SHRINK)) {2283w /= 2.0;2284assert(w >= 
1.0);2285split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1);2286adjust_split_source_coords(&orig->x, &split_coords.x, x_scale);2287}2288if (result & (BLIT_SRC_HEIGHT_SHRINK | BLIT_DST_HEIGHT_SHRINK)) {2289h /= 2.0;2290assert(h >= 1.0);2291split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1);2292adjust_split_source_coords(&orig->y, &split_coords.y, y_scale);2293}22942295if (result) {2296/* We may get less bits set on result than we had already, so make2297* sure we remember all the ways in which a resize is required.2298*/2299shrink |= result;2300continue;2301}23022303y_done = (orig->y.dst1 - split_coords.y.dst1 < 0.5);2304x_done = y_done && (orig->x.dst1 - split_coords.x.dst1 < 0.5);2305if (x_done) {2306break;2307} else if (y_done) {2308split_coords.x.dst0 += w;2309split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1);2310split_coords.y.dst0 = orig->y.dst0;2311split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1);2312adjust_split_source_coords(&orig->x, &split_coords.x, x_scale);2313} else {2314split_coords.y.dst0 += h;2315split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1);2316adjust_split_source_coords(&orig->y, &split_coords.y, y_scale);2317}2318} while (true);2319}23202321void2322blorp_blit(struct blorp_batch *batch,2323const struct blorp_surf *src_surf,2324unsigned src_level, float src_layer,2325enum isl_format src_format, struct isl_swizzle src_swizzle,2326const struct blorp_surf *dst_surf,2327unsigned dst_level, unsigned dst_layer,2328enum isl_format dst_format, struct isl_swizzle dst_swizzle,2329float src_x0, float src_y0,2330float src_x1, float src_y1,2331float dst_x0, float dst_y0,2332float dst_x1, float dst_y1,2333enum blorp_filter filter,2334bool mirror_x, bool mirror_y)2335{2336struct blorp_params params;2337blorp_params_init(¶ms);2338params.snapshot_type = INTEL_SNAPSHOT_BLIT;23392340/* We cannot handle combined depth and stencil. 
*/2341if (src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT)2342assert(src_surf->surf->format == ISL_FORMAT_R8_UINT);2343if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT)2344assert(dst_surf->surf->format == ISL_FORMAT_R8_UINT);23452346if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) {2347assert(src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT);2348/* Prior to Broadwell, we can't render to R8_UINT */2349if (batch->blorp->isl_dev->info->ver < 8) {2350src_format = ISL_FORMAT_R8_UNORM;2351dst_format = ISL_FORMAT_R8_UNORM;2352}2353}23542355brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level,2356src_layer, src_format, false);2357brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level,2358dst_layer, dst_format, true);23592360params.src.view.swizzle = src_swizzle;2361params.dst.view.swizzle = dst_swizzle;23622363const struct isl_format_layout *src_fmtl =2364isl_format_get_layout(params.src.view.format);23652366struct brw_blorp_blit_prog_key wm_prog_key = {2367.base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_BLIT),2368.filter = filter,2369.sint32_to_uint = src_fmtl->channels.r.bits == 32 &&2370isl_format_has_sint_channel(params.src.view.format) &&2371isl_format_has_uint_channel(params.dst.view.format),2372.uint32_to_sint = src_fmtl->channels.r.bits == 32 &&2373isl_format_has_uint_channel(params.src.view.format) &&2374isl_format_has_sint_channel(params.dst.view.format),2375};23762377/* Scaling factors used for bilinear filtering in multisample scaled2378* blits.2379*/2380if (params.src.surf.samples == 16)2381wm_prog_key.x_scale = 4.0f;2382else2383wm_prog_key.x_scale = 2.0f;2384wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale;23852386params.wm_inputs.rect_grid.x1 =2387minify(params.src.surf.logical_level0_px.width, src_level) *2388wm_prog_key.x_scale - 1.0f;2389params.wm_inputs.rect_grid.y1 =2390minify(params.src.surf.logical_level0_px.height, src_level) *2391wm_prog_key.y_scale - 1.0f;23922393struct 
blt_coords coords = {2394.x = {2395.src0 = src_x0,2396.src1 = src_x1,2397.dst0 = dst_x0,2398.dst1 = dst_x1,2399.mirror = mirror_x2400},2401.y = {2402.src0 = src_y0,2403.src1 = src_y1,2404.dst0 = dst_y0,2405.dst1 = dst_y1,2406.mirror = mirror_y2407}2408};24092410do_blorp_blit(batch, ¶ms, &wm_prog_key, &coords);2411}24122413static enum isl_format2414get_copy_format_for_bpb(const struct isl_device *isl_dev, unsigned bpb)2415{2416/* The choice of UNORM and UINT formats is very intentional here. Most2417* of the time, we want to use a UINT format to avoid any rounding error2418* in the blit. For stencil blits, R8_UINT is required by the hardware.2419* (It's the only format allowed in conjunction with W-tiling.) Also we2420* intentionally use the 4-channel formats whenever we can. This is so2421* that, when we do a RGB <-> RGBX copy, the two formats will line up2422* even though one of them is 3/4 the size of the other. The choice of2423* UNORM vs. UINT is also very intentional because we don't have 8 or2424* 16-bit RGB UINT formats until Sky Lake so we have to use UNORM there.2425* Fortunately, the only time we should ever use two different formats in2426* the table below is for RGB -> RGBA blits and so we will never have any2427* UNORM/UINT mismatch.2428*/2429if (ISL_GFX_VER(isl_dev) >= 9) {2430switch (bpb) {2431case 8: return ISL_FORMAT_R8_UINT;2432case 16: return ISL_FORMAT_R8G8_UINT;2433case 24: return ISL_FORMAT_R8G8B8_UINT;2434case 32: return ISL_FORMAT_R8G8B8A8_UINT;2435case 48: return ISL_FORMAT_R16G16B16_UINT;2436case 64: return ISL_FORMAT_R16G16B16A16_UINT;2437case 96: return ISL_FORMAT_R32G32B32_UINT;2438case 128:return ISL_FORMAT_R32G32B32A32_UINT;2439default:2440unreachable("Unknown format bpb");2441}2442} else {2443switch (bpb) {2444case 8: return ISL_FORMAT_R8_UINT;2445case 16: return ISL_FORMAT_R8G8_UINT;2446case 24: return ISL_FORMAT_R8G8B8_UNORM;2447case 32: return ISL_FORMAT_R8G8B8A8_UNORM;2448case 48: return ISL_FORMAT_R16G16B16_UNORM;2449case 64: 
return ISL_FORMAT_R16G16B16A16_UNORM;2450case 96: return ISL_FORMAT_R32G32B32_UINT;2451case 128:return ISL_FORMAT_R32G32B32A32_UINT;2452default:2453unreachable("Unknown format bpb");2454}2455}2456}24572458/** Returns a UINT format that is CCS-compatible with the given format2459*2460* The PRM's say absolutely nothing about how render compression works. The2461* only thing they provide is a list of formats on which it is and is not2462* supported. Empirical testing indicates that the compression is only based2463* on the bit-layout of the format and the channel encoding doesn't matter.2464* So, while texture views don't work in general, you can create a view as2465* long as the bit-layout of the formats are the same.2466*2467* Fortunately, for every render compression capable format, the UINT format2468* with the same bit layout also supports render compression. This means that2469* we only need to handle UINT formats for copy operations. In order to do2470* copies between formats with different bit layouts, we attach both with a2471* UINT format and use bit_cast_color() to generate code to do the bit-cast2472* operation between the two bit layouts.2473*/2474static enum isl_format2475get_ccs_compatible_copy_format(const struct isl_format_layout *fmtl)2476{2477switch (fmtl->format) {2478case ISL_FORMAT_R32G32B32A32_FLOAT:2479case ISL_FORMAT_R32G32B32A32_SINT:2480case ISL_FORMAT_R32G32B32A32_UINT:2481case ISL_FORMAT_R32G32B32A32_UNORM:2482case ISL_FORMAT_R32G32B32A32_SNORM:2483case ISL_FORMAT_R32G32B32X32_FLOAT:2484return ISL_FORMAT_R32G32B32A32_UINT;24852486case ISL_FORMAT_R16G16B16A16_UNORM:2487case ISL_FORMAT_R16G16B16A16_SNORM:2488case ISL_FORMAT_R16G16B16A16_SINT:2489case ISL_FORMAT_R16G16B16A16_UINT:2490case ISL_FORMAT_R16G16B16A16_FLOAT:2491case ISL_FORMAT_R16G16B16X16_UNORM:2492case ISL_FORMAT_R16G16B16X16_FLOAT:2493return ISL_FORMAT_R16G16B16A16_UINT;24942495case ISL_FORMAT_R32G32_FLOAT:2496case ISL_FORMAT_R32G32_SINT:2497case ISL_FORMAT_R32G32_UINT:2498case 
ISL_FORMAT_R32G32_UNORM:2499case ISL_FORMAT_R32G32_SNORM:2500return ISL_FORMAT_R32G32_UINT;25012502case ISL_FORMAT_B8G8R8A8_UNORM:2503case ISL_FORMAT_B8G8R8A8_UNORM_SRGB:2504case ISL_FORMAT_R8G8B8A8_UNORM:2505case ISL_FORMAT_R8G8B8A8_UNORM_SRGB:2506case ISL_FORMAT_R8G8B8A8_SNORM:2507case ISL_FORMAT_R8G8B8A8_SINT:2508case ISL_FORMAT_R8G8B8A8_UINT:2509case ISL_FORMAT_B8G8R8X8_UNORM:2510case ISL_FORMAT_B8G8R8X8_UNORM_SRGB:2511case ISL_FORMAT_R8G8B8X8_UNORM:2512case ISL_FORMAT_R8G8B8X8_UNORM_SRGB:2513return ISL_FORMAT_R8G8B8A8_UINT;25142515case ISL_FORMAT_R16G16_UNORM:2516case ISL_FORMAT_R16G16_SNORM:2517case ISL_FORMAT_R16G16_SINT:2518case ISL_FORMAT_R16G16_UINT:2519case ISL_FORMAT_R16G16_FLOAT:2520return ISL_FORMAT_R16G16_UINT;25212522case ISL_FORMAT_R32_SINT:2523case ISL_FORMAT_R32_UINT:2524case ISL_FORMAT_R32_FLOAT:2525case ISL_FORMAT_R32_UNORM:2526case ISL_FORMAT_R32_SNORM:2527return ISL_FORMAT_R32_UINT;25282529case ISL_FORMAT_B10G10R10A2_UNORM:2530case ISL_FORMAT_B10G10R10A2_UNORM_SRGB:2531case ISL_FORMAT_R10G10B10A2_UNORM:2532case ISL_FORMAT_R10G10B10A2_UNORM_SRGB:2533case ISL_FORMAT_R10G10B10_FLOAT_A2_UNORM:2534case ISL_FORMAT_R10G10B10A2_UINT:2535return ISL_FORMAT_R10G10B10A2_UINT;25362537case ISL_FORMAT_R16_UNORM:2538case ISL_FORMAT_R16_SNORM:2539case ISL_FORMAT_R16_SINT:2540case ISL_FORMAT_R16_UINT:2541case ISL_FORMAT_R16_FLOAT:2542return ISL_FORMAT_R16_UINT;25432544case ISL_FORMAT_R8G8_UNORM:2545case ISL_FORMAT_R8G8_SNORM:2546case ISL_FORMAT_R8G8_SINT:2547case ISL_FORMAT_R8G8_UINT:2548return ISL_FORMAT_R8G8_UINT;25492550case ISL_FORMAT_B5G5R5X1_UNORM:2551case ISL_FORMAT_B5G5R5X1_UNORM_SRGB:2552case ISL_FORMAT_B5G5R5A1_UNORM:2553case ISL_FORMAT_B5G5R5A1_UNORM_SRGB:2554return ISL_FORMAT_B5G5R5A1_UNORM;25552556case ISL_FORMAT_A4B4G4R4_UNORM:2557case ISL_FORMAT_B4G4R4A4_UNORM:2558case ISL_FORMAT_B4G4R4A4_UNORM_SRGB:2559return ISL_FORMAT_B4G4R4A4_UNORM;25602561case ISL_FORMAT_B5G6R5_UNORM:2562case ISL_FORMAT_B5G6R5_UNORM_SRGB:2563return 
ISL_FORMAT_B5G6R5_UNORM;25642565case ISL_FORMAT_A1B5G5R5_UNORM:2566return ISL_FORMAT_A1B5G5R5_UNORM;25672568case ISL_FORMAT_A8_UNORM:2569case ISL_FORMAT_R8_UNORM:2570case ISL_FORMAT_R8_SNORM:2571case ISL_FORMAT_R8_SINT:2572case ISL_FORMAT_R8_UINT:2573return ISL_FORMAT_R8_UINT;25742575default:2576unreachable("Not a compressible format");2577}2578}25792580void2581blorp_surf_convert_to_uncompressed(const struct isl_device *isl_dev,2582struct brw_blorp_surface_info *info,2583uint32_t *x, uint32_t *y,2584uint32_t *width, uint32_t *height)2585{2586const struct isl_format_layout *fmtl =2587isl_format_get_layout(info->surf.format);25882589assert(fmtl->bw > 1 || fmtl->bh > 1);25902591/* This should be the first modification made to the surface */2592assert(info->tile_x_sa == 0 && info->tile_y_sa == 0);25932594if (width && height) {2595ASSERTED const uint32_t level_width =2596minify(info->surf.logical_level0_px.width, info->view.base_level);2597ASSERTED const uint32_t level_height =2598minify(info->surf.logical_level0_px.height, info->view.base_level);2599assert(*width % fmtl->bw == 0 || *x + *width == level_width);2600assert(*height % fmtl->bh == 0 || *y + *height == level_height);2601*width = DIV_ROUND_UP(*width, fmtl->bw);2602*height = DIV_ROUND_UP(*height, fmtl->bh);2603}26042605if (x && y) {2606assert(*x % fmtl->bw == 0);2607assert(*y % fmtl->bh == 0);2608*x /= fmtl->bw;2609*y /= fmtl->bh;2610}26112612/* We only want one level and slice */2613info->view.levels = 1;2614info->view.array_len = 1;26152616if (info->surf.dim == ISL_SURF_DIM_3D) {2617/* Roll the Z offset into the image view */2618info->view.base_array_layer += info->z_offset;2619info->z_offset = 0;2620}26212622uint32_t offset_B;2623ASSERTED bool ok =2624isl_surf_get_uncompressed_surf(isl_dev, &info->surf, &info->view,2625&info->surf, &info->view, &offset_B,2626&info->tile_x_sa, &info->tile_y_sa);2627assert(ok);2628info->addr.offset += offset_B;26292630/* BLORP doesn't use the actual intratile offsets. 
Instead, it needs the2631* surface to be a bit bigger and we offset the vertices instead.2632*/2633assert(info->surf.dim == ISL_SURF_DIM_2D);2634assert(info->surf.logical_level0_px.array_len == 1);2635info->surf.logical_level0_px.w += info->tile_x_sa;2636info->surf.logical_level0_px.h += info->tile_y_sa;2637info->surf.phys_level0_sa.w += info->tile_x_sa;2638info->surf.phys_level0_sa.h += info->tile_y_sa;2639}26402641void2642blorp_copy(struct blorp_batch *batch,2643const struct blorp_surf *src_surf,2644unsigned src_level, unsigned src_layer,2645const struct blorp_surf *dst_surf,2646unsigned dst_level, unsigned dst_layer,2647uint32_t src_x, uint32_t src_y,2648uint32_t dst_x, uint32_t dst_y,2649uint32_t src_width, uint32_t src_height)2650{2651const struct isl_device *isl_dev = batch->blorp->isl_dev;2652struct blorp_params params;26532654if (src_width == 0 || src_height == 0)2655return;26562657blorp_params_init(¶ms);2658params.snapshot_type = INTEL_SNAPSHOT_COPY;2659brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level,2660src_layer, ISL_FORMAT_UNSUPPORTED, false);2661brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level,2662dst_layer, ISL_FORMAT_UNSUPPORTED, true);26632664struct brw_blorp_blit_prog_key wm_prog_key = {2665.base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_COPY),2666.filter = BLORP_FILTER_NONE,2667.need_src_offset = src_surf->tile_x_sa || src_surf->tile_y_sa,2668.need_dst_offset = dst_surf->tile_x_sa || dst_surf->tile_y_sa,2669};26702671const struct isl_format_layout *src_fmtl =2672isl_format_get_layout(params.src.surf.format);2673const struct isl_format_layout *dst_fmtl =2674isl_format_get_layout(params.dst.surf.format);26752676assert(params.src.aux_usage == ISL_AUX_USAGE_NONE ||2677params.src.aux_usage == ISL_AUX_USAGE_HIZ ||2678params.src.aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT ||2679params.src.aux_usage == ISL_AUX_USAGE_MCS ||2680params.src.aux_usage == ISL_AUX_USAGE_MCS_CCS ||2681params.src.aux_usage == 
ISL_AUX_USAGE_CCS_E ||2682params.src.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E ||2683params.src.aux_usage == ISL_AUX_USAGE_STC_CCS);26842685if (isl_aux_usage_has_hiz(params.src.aux_usage)) {2686/* In order to use HiZ, we have to use the real format for the source.2687* Depth <-> Color copies are not allowed.2688*/2689params.src.view.format = params.src.surf.format;2690params.dst.view.format = params.src.surf.format;2691} else if ((params.dst.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) &&2692isl_dev->info->ver >= 7) {2693/* On Gfx7 and higher, we use actual depth writes for blits into depth2694* buffers so we need the real format.2695*/2696params.src.view.format = params.dst.surf.format;2697params.dst.view.format = params.dst.surf.format;2698} else if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E ||2699params.dst.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E) {2700params.dst.view.format = get_ccs_compatible_copy_format(dst_fmtl);2701if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E ||2702params.src.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E) {2703params.src.view.format = get_ccs_compatible_copy_format(src_fmtl);2704} else if (src_fmtl->bpb == dst_fmtl->bpb) {2705params.src.view.format = params.dst.view.format;2706} else {2707params.src.view.format =2708get_copy_format_for_bpb(isl_dev, src_fmtl->bpb);2709}2710} else if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E ||2711params.src.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E) {2712params.src.view.format = get_ccs_compatible_copy_format(src_fmtl);2713if (src_fmtl->bpb == dst_fmtl->bpb) {2714params.dst.view.format = params.src.view.format;2715} else {2716params.dst.view.format =2717get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb);2718}2719} else {2720params.dst.view.format = get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb);2721params.src.view.format = get_copy_format_for_bpb(isl_dev, src_fmtl->bpb);2722}27232724if (params.src.view.format != params.dst.view.format) {2725enum isl_format src_cast_format = params.src.view.format;2726enum isl_format 
dst_cast_format = params.dst.view.format;27272728/* The BLORP bitcast code gets confused by RGB formats. Just treat them2729* as RGBA and then everything will be happy. This is perfectly safe2730* because BLORP likes to treat things as if they have vec4 colors all2731* the time anyway.2732*/2733if (isl_format_get_layout(src_cast_format)->bpb % 3 == 0)2734src_cast_format = isl_format_rgb_to_rgba(src_cast_format);2735if (isl_format_get_layout(dst_cast_format)->bpb % 3 == 0)2736dst_cast_format = isl_format_rgb_to_rgba(dst_cast_format);27372738if (src_cast_format != dst_cast_format) {2739wm_prog_key.format_bit_cast = true;2740wm_prog_key.src_format = src_cast_format;2741wm_prog_key.dst_format = dst_cast_format;2742}2743}27442745if (src_fmtl->bw > 1 || src_fmtl->bh > 1) {2746blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.src,2747&src_x, &src_y,2748&src_width, &src_height);2749wm_prog_key.need_src_offset = true;2750}27512752if (dst_fmtl->bw > 1 || dst_fmtl->bh > 1) {2753blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.dst,2754&dst_x, &dst_y, NULL, NULL);2755wm_prog_key.need_dst_offset = true;2756}27572758/* Once both surfaces are stompped to uncompressed as needed, the2759* destination size is the same as the source size.2760*/2761uint32_t dst_width = src_width;2762uint32_t dst_height = src_height;27632764struct blt_coords coords = {2765.x = {2766.src0 = src_x,2767.src1 = src_x + src_width,2768.dst0 = dst_x,2769.dst1 = dst_x + dst_width,2770.mirror = false2771},2772.y = {2773.src0 = src_y,2774.src1 = src_y + src_height,2775.dst0 = dst_y,2776.dst1 = dst_y + dst_height,2777.mirror = false2778}2779};27802781do_blorp_blit(batch, ¶ms, &wm_prog_key, &coords);2782}27832784static enum isl_format2785isl_format_for_size(unsigned size_B)2786{2787switch (size_B) {2788case 1: return ISL_FORMAT_R8_UINT;2789case 2: return ISL_FORMAT_R8G8_UINT;2790case 4: return ISL_FORMAT_R8G8B8A8_UINT;2791case 8: return ISL_FORMAT_R16G16B16A16_UINT;2792case 16: return 
ISL_FORMAT_R32G32B32A32_UINT;2793default:2794unreachable("Not a power-of-two format size");2795}2796}27972798/**2799* Returns the greatest common divisor of a and b that is a power of two.2800*/2801static uint64_t2802gcd_pow2_u64(uint64_t a, uint64_t b)2803{2804assert(a > 0 || b > 0);28052806unsigned a_log2 = ffsll(a) - 1;2807unsigned b_log2 = ffsll(b) - 1;28082809/* If either a or b is 0, then a_log2 or b_log2 till be UINT_MAX in which2810* case, the MIN2() will take the other one. If both are 0 then we will2811* hit the assert above.2812*/2813return 1 << MIN2(a_log2, b_log2);2814}28152816static void2817do_buffer_copy(struct blorp_batch *batch,2818struct blorp_address *src,2819struct blorp_address *dst,2820int width, int height, int block_size)2821{2822/* The actual format we pick doesn't matter as blorp will throw it away.2823* The only thing that actually matters is the size.2824*/2825enum isl_format format = isl_format_for_size(block_size);28262827UNUSED bool ok;2828struct isl_surf surf;2829ok = isl_surf_init(batch->blorp->isl_dev, &surf,2830.dim = ISL_SURF_DIM_2D,2831.format = format,2832.width = width,2833.height = height,2834.depth = 1,2835.levels = 1,2836.array_len = 1,2837.samples = 1,2838.row_pitch_B = width * block_size,2839.usage = ISL_SURF_USAGE_TEXTURE_BIT |2840ISL_SURF_USAGE_RENDER_TARGET_BIT,2841.tiling_flags = ISL_TILING_LINEAR_BIT);2842assert(ok);28432844struct blorp_surf src_blorp_surf = {2845.surf = &surf,2846.addr = *src,2847};28482849struct blorp_surf dst_blorp_surf = {2850.surf = &surf,2851.addr = *dst,2852};28532854blorp_copy(batch, &src_blorp_surf, 0, 0, &dst_blorp_surf, 0, 0,28550, 0, 0, 0, width, height);2856}28572858void2859blorp_buffer_copy(struct blorp_batch *batch,2860struct blorp_address src,2861struct blorp_address dst,2862uint64_t size)2863{2864const struct intel_device_info *devinfo = batch->blorp->isl_dev->info;2865uint64_t copy_size = size;28662867/* This is maximum possible width/height our HW can handle */2868uint64_t 
max_surface_dim = 1 << (devinfo->ver >= 7 ? 14 : 13);28692870/* First, we compute the biggest format that can be used with the2871* given offsets and size.2872*/2873int bs = 16;2874bs = gcd_pow2_u64(bs, src.offset);2875bs = gcd_pow2_u64(bs, dst.offset);2876bs = gcd_pow2_u64(bs, size);28772878/* First, we make a bunch of max-sized copies */2879uint64_t max_copy_size = max_surface_dim * max_surface_dim * bs;2880while (copy_size >= max_copy_size) {2881do_buffer_copy(batch, &src, &dst, max_surface_dim, max_surface_dim, bs);2882copy_size -= max_copy_size;2883src.offset += max_copy_size;2884dst.offset += max_copy_size;2885}28862887/* Now make a max-width copy */2888uint64_t height = copy_size / (max_surface_dim * bs);2889assert(height < max_surface_dim);2890if (height != 0) {2891uint64_t rect_copy_size = height * max_surface_dim * bs;2892do_buffer_copy(batch, &src, &dst, max_surface_dim, height, bs);2893copy_size -= rect_copy_size;2894src.offset += rect_copy_size;2895dst.offset += rect_copy_size;2896}28972898/* Finally, make a small copy to finish it off */2899if (copy_size != 0) {2900do_buffer_copy(batch, &src, &dst, copy_size / bs, 1, bs);2901}2902}290329042905