Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_validate.c
4570 views
1/*2* Copyright © 2014 Broadcom3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS21* IN THE SOFTWARE.22*/2324#include <stdlib.h>2526#include "vc4_qpu.h"2728static void29fail_instr(uint64_t inst, const char *msg)30{31fprintf(stderr, "vc4_qpu_validate: %s: ", msg);32vc4_qpu_disasm(&inst, 1);33fprintf(stderr, "\n");34abort();35}3637static bool38writes_reg(uint64_t inst, uint32_t w)39{40return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||41QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);42}4344static bool45_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)46{47struct {48uint32_t mux, addr;49} src_regs[] = {50{ QPU_GET_FIELD(inst, QPU_ADD_A) },51{ QPU_GET_FIELD(inst, QPU_ADD_B) },52{ QPU_GET_FIELD(inst, QPU_MUL_A) },53{ QPU_GET_FIELD(inst, QPU_MUL_B) },54};5556/* Branches only reference raddr_a (no mux), and we don't use that57* feature of branching.58*/59if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)60return false;6162/* Load immediates don't read any registers. */63if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)64return false;6566for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {67if (!ignore_a &&68src_regs[i].mux == QPU_MUX_A &&69(QPU_GET_FIELD(inst, QPU_RADDR_A) == r))70return true;7172if (!ignore_b &&73QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&74src_regs[i].mux == QPU_MUX_B &&75(QPU_GET_FIELD(inst, QPU_RADDR_B) == r))76return true;77}7879return false;80}8182static bool83reads_reg(uint64_t inst, uint32_t r)84{85return _reads_reg(inst, r, false, false);86}8788static bool89reads_a_reg(uint64_t inst, uint32_t r)90{91return _reads_reg(inst, r, false, true);92}9394static bool95reads_b_reg(uint64_t inst, uint32_t r)96{97return _reads_reg(inst, r, true, false);98}99100static bool101writes_sfu(uint64_t inst)102{103return (writes_reg(inst, QPU_W_SFU_RECIP) ||104writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||105writes_reg(inst, QPU_W_SFU_EXP) ||106writes_reg(inst, QPU_W_SFU_LOG));107}108109/**110* Checks for the instruction restrictions from page 37 ("Summary of111* Instruction Restrictions").112*/113void114vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)115{116bool scoreboard_locked = false;117bool threaded = false;118119/* We don't want to do validation in release builds, but we want to120* keep compiling the validation code to make sure it doesn't get121* broken.122*/123#ifndef DEBUG124return;125#endif126127for (int i = 0; i < num_inst; i++) {128uint64_t inst = insts[i];129uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);130131if (sig != QPU_SIG_PROG_END) {132if (qpu_inst_is_tlb(inst))133scoreboard_locked = true;134135if (sig == QPU_SIG_THREAD_SWITCH ||136sig == QPU_SIG_LAST_THREAD_SWITCH) {137threaded = true;138}139140continue;141}142143/* "The Thread End instruction must not write to either physical144* regfile A or B."145*/146if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||147QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {148fail_instr(inst, "write to phys reg in thread end");149}150151/* Can't trigger an implicit wait on scoreboard in the program152* end instruction.153*/154if (qpu_inst_is_tlb(inst) && !scoreboard_locked)155fail_instr(inst, "implicit sb wait in program end");156157/* Two delay slots will be executed. */158assert(i + 2 <= num_inst);159160for (int j = i; j < i + 2; j++) {161/* "The last three instructions of any program162* (Thread End plus the following two delay-slot163* instructions) must not do varyings read, uniforms164* read or any kind of VPM, VDR, or VDW read or165* write."166*/167if (writes_reg(insts[j], QPU_W_VPM) ||168reads_reg(insts[j], QPU_R_VARY) ||169reads_reg(insts[j], QPU_R_UNIF) ||170reads_reg(insts[j], QPU_R_VPM)) {171fail_instr(insts[j], "last 3 instructions "172"using fixed functions");173}174175/* "The Thread End instruction and the following two176* delay slot instructions must not write or read177* address 14 in either regfile A or B."178*/179if (writes_reg(insts[j], 14) ||180reads_reg(insts[j], 14)) {181fail_instr(insts[j], "last 3 instructions "182"must not use r14");183}184}185186/* "The final program instruction (the second delay slot187* instruction) must not do a TLB Z write."188*/189if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {190fail_instr(insts[i + 2], "final instruction doing "191"Z write");192}193}194195/* "A scoreboard wait must not occur in the first two instructions of196* a fragment shader. This is either the explicit Wait for Scoreboard197* signal or an implicit wait with the first tile-buffer read or198* write instruction."199*/200for (int i = 0; i < 2; i++) {201uint64_t inst = insts[i];202203if (qpu_inst_is_tlb(inst))204fail_instr(inst, "sb wait in first two insts");205}206207/* "If TMU_NOSWAP is written, the write must be three instructions208* before the first TMU write instruction. For example, if209* TMU_NOSWAP is written in the first shader instruction, the first210* TMU write cannot occur before the 4th shader instruction."211*/212int last_tmu_noswap = -10;213for (int i = 0; i < num_inst; i++) {214uint64_t inst = insts[i];215216if ((i - last_tmu_noswap) <= 3 &&217(writes_reg(inst, QPU_W_TMU0_S) ||218writes_reg(inst, QPU_W_TMU1_S))) {219fail_instr(inst, "TMU write too soon after TMU_NOSWAP");220}221222if (writes_reg(inst, QPU_W_TMU_NOSWAP))223last_tmu_noswap = i;224}225226/* "An instruction must not read from a location in physical regfile A227* or B that was written to by the previous instruction."228*/229for (int i = 0; i < num_inst - 1; i++) {230uint64_t inst = insts[i];231uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);232uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);233uint32_t waddr_a, waddr_b;234235if (inst & QPU_WS) {236waddr_b = add_waddr;237waddr_a = mul_waddr;238} else {239waddr_a = add_waddr;240waddr_b = mul_waddr;241}242243if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||244(waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {245fail_instr(insts[i + 1],246"Reads physical reg too soon after write");247}248}249250/* "After an SFU lookup instruction, accumulator r4 must not be read251* in the following two instructions. Any other instruction that252* results in r4 being written (that is, TMU read, TLB read, SFU253* lookup) cannot occur in the two instructions following an SFU254* lookup."255*/256int last_sfu_inst = -10;257for (int i = 0; i < num_inst - 1; i++) {258uint64_t inst = insts[i];259uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);260261if (i - last_sfu_inst <= 2 &&262(writes_sfu(inst) ||263sig == QPU_SIG_LOAD_TMU0 ||264sig == QPU_SIG_LOAD_TMU1 ||265sig == QPU_SIG_COLOR_LOAD)) {266fail_instr(inst, "R4 write too soon after SFU write");267}268269if (writes_sfu(inst))270last_sfu_inst = i;271}272273for (int i = 0; i < num_inst - 1; i++) {274uint64_t inst = insts[i];275276if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&277QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=278QPU_SMALL_IMM_MUL_ROT) {279uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);280uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);281282/* "The full horizontal vector rotate is only283* available when both of the mul ALU input arguments284* are taken from accumulators r0-r3."285*/286if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {287fail_instr(inst,288"MUL rotate using non-accumulator "289"input");290}291292if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==293QPU_SMALL_IMM_MUL_ROT) {294/* "An instruction that does a vector rotate295* by r5 must not immediately follow an296* instruction that writes to r5."297*/298if (writes_reg(insts[i - 1], QPU_W_ACC5)) {299fail_instr(inst,300"vector rotate by r5 "301"immediately after r5 write");302}303}304305/* "An instruction that does a vector rotate must not306* immediately follow an instruction that writes to the307* accumulator that is being rotated."308*/309if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||310writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {311fail_instr(inst,312"vector rotate of value "313"written in previous instruction");314}315}316}317318/* "An instruction that does a vector rotate must not immediately319* follow an instruction that writes to the accumulator that is being320* rotated.321*322* XXX: TODO.323*/324325/* "After an instruction that does a TLB Z write, the multisample mask326* must not be read as an instruction input argument in the following327* two instruction. The TLB Z write instruction can, however, be328* followed immediately by a TLB color write."329*/330for (int i = 0; i < num_inst - 1; i++) {331uint64_t inst = insts[i];332if (writes_reg(inst, QPU_W_TLB_Z) &&333(reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||334reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {335fail_instr(inst, "TLB Z write followed by MS mask read");336}337}338339/*340* "A single instruction can only perform a maximum of one of the341* following closely coupled peripheral accesses in a single342* instruction: TMU write, TMU read, TLB write, TLB read, TLB343* combined color read and write, SFU write, Mutex read or Semaphore344* access."345*/346for (int i = 0; i < num_inst - 1; i++) {347uint64_t inst = insts[i];348349if (qpu_num_sf_accesses(inst) > 1)350fail_instr(inst, "Single instruction writes SFU twice");351}352353/* "The uniform base pointer can be written (from SIMD element 0) by354* the processor to reset the stream, there must be at least two355* nonuniform-accessing instructions following a pointer change356* before uniforms can be accessed once more."357*/358int last_unif_pointer_update = -3;359for (int i = 0; i < num_inst; i++) {360uint64_t inst = insts[i];361uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);362uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);363364if (reads_reg(inst, QPU_R_UNIF) &&365i - last_unif_pointer_update <= 2) {366fail_instr(inst,367"uniform read too soon after pointer update");368}369370if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||371waddr_mul == QPU_W_UNIFORMS_ADDRESS)372last_unif_pointer_update = i;373}374375if (threaded) {376bool last_thrsw_found = false;377bool scoreboard_locked = false;378int tex_samples_outstanding = 0;379int last_tex_samples_outstanding = 0;380int thrsw_ip = -1;381382for (int i = 0; i < num_inst; i++) {383uint64_t inst = insts[i];384uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);385386if (i == thrsw_ip) {387/* In order to get texture results back in the388* correct order, before a new thrsw we have389* to read all the texture results from before390* the previous thrsw.391*392* FIXME: Is collecting the remaining results393* during the delay slots OK, or should we do394* this at THRSW signal time?395*/396if (last_tex_samples_outstanding != 0) {397fail_instr(inst, "THRSW with texture "398"results from the previous "399"THRSW still in the FIFO.");400}401402last_tex_samples_outstanding =403tex_samples_outstanding;404tex_samples_outstanding = 0;405}406407if (qpu_inst_is_tlb(inst))408scoreboard_locked = true;409410switch (sig) {411case QPU_SIG_THREAD_SWITCH:412case QPU_SIG_LAST_THREAD_SWITCH:413/* No thread switching with the scoreboard414* locked. Doing so means we may deadlock415* when the other thread tries to lock416* scoreboard.417*/418if (scoreboard_locked) {419fail_instr(inst, "THRSW with the "420"scoreboard locked.");421}422423/* No thread switching after lthrsw, since424* lthrsw means that we get delayed until the425* other shader is ready for us to terminate.426*/427if (last_thrsw_found) {428fail_instr(inst, "THRSW after a "429"previous LTHRSW");430}431432if (sig == QPU_SIG_LAST_THREAD_SWITCH)433last_thrsw_found = true;434435/* No THRSW while we already have a THRSW436* queued.437*/438if (i < thrsw_ip) {439fail_instr(inst,440"THRSW with a THRSW queued.");441}442443thrsw_ip = i + 3;444break;445446case QPU_SIG_LOAD_TMU0:447case QPU_SIG_LOAD_TMU1:448if (last_tex_samples_outstanding == 0) {449fail_instr(inst, "TMU load with nothing "450"in the results fifo from "451"the previous THRSW.");452}453454last_tex_samples_outstanding--;455break;456}457458uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);459uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);460if (waddr_add == QPU_W_TMU0_S ||461waddr_add == QPU_W_TMU1_S ||462waddr_mul == QPU_W_TMU0_S ||463waddr_mul == QPU_W_TMU1_S) {464tex_samples_outstanding++;465}466}467}468}469470471