Path: blob/21.2-virgl/src/gallium/frontends/clover/core/kernel.cpp
4572 views
//1// Copyright 2012 Francisco Jerez2//3// Permission is hereby granted, free of charge, to any person obtaining a4// copy of this software and associated documentation files (the "Software"),5// to deal in the Software without restriction, including without limitation6// the rights to use, copy, modify, merge, publish, distribute, sublicense,7// and/or sell copies of the Software, and to permit persons to whom the8// Software is furnished to do so, subject to the following conditions:9//10// The above copyright notice and this permission notice shall be included in11// all copies or substantial portions of the Software.12//13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR17// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19// OTHER DEALINGS IN THE SOFTWARE.20//2122#include "core/kernel.hpp"23#include "core/resource.hpp"24#include "util/factor.hpp"25#include "util/u_math.h"26#include "pipe/p_context.h"2728using namespace clover;2930kernel::kernel(clover::program &prog, const std::string &name,31const std::vector<module::argument> &margs) :32program(prog), _name(name), exec(*this),33program_ref(prog._kernel_ref_counter) {34for (auto &marg : margs) {35if (marg.semantic == module::argument::general)36_args.emplace_back(argument::create(marg));37}38for (auto &dev : prog.devices()) {39auto &m = prog.build(dev).binary;40auto msym = find(name_equals(name), m.syms);41const auto f = id_type_equals(msym.section, module::section::data_constant);42if (!any_of(f, m.secs))43continue;4445auto mconst = find(f, m.secs);46auto rb = std::make_unique<root_buffer>(prog.context(), std::vector<cl_mem_properties>(),47CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,48mconst.size, mconst.data.data());49_constant_buffers.emplace(&dev, std::move(rb));50}51}5253template<typename V>54static inline std::vector<uint>55pad_vector(command_queue &q, const V &v, uint x) {56std::vector<uint> w { v.begin(), v.end() };57w.resize(q.device().max_block_size().size(), x);58return w;59}6061void62kernel::launch(command_queue &q,63const std::vector<size_t> &grid_offset,64const std::vector<size_t> &grid_size,65const std::vector<size_t> &block_size) {66const auto m = program().build(q.device()).binary;67const auto reduced_grid_size =68map(divides(), grid_size, block_size);69void *st = exec.bind(&q, grid_offset);70struct pipe_grid_info info = {};7172// The handles are created during exec_context::bind(), so we need make73// sure to call exec_context::bind() before retrieving them.74std::vector<uint32_t *> g_handles = map([&](size_t h) {75return (uint32_t *)&exec.input[h];76}, exec.g_handles);7778q.pipe->bind_compute_state(q.pipe, st);79q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,800, exec.samplers.size(),81exec.samplers.data());8283q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,84exec.sviews.size(), 0, exec.sviews.data());85q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,86exec.iviews.size(), 0, exec.iviews.data());87q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),88exec.resources.data());89q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),90exec.g_buffers.data(), g_handles.data());9192// Fill information for the launch_grid() call.93info.work_dim = grid_size.size();94copy(pad_vector(q, block_size, 1), info.block);95copy(pad_vector(q, reduced_grid_size, 1), info.grid);96info.pc = find(name_equals(_name), m.syms).offset;97info.input = exec.input.data();9899q.pipe->launch_grid(q.pipe, &info);100101q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);102q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);103q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,1040, exec.iviews.size(), NULL);105q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,1060, exec.sviews.size(), NULL);107q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,108exec.samplers.size(), NULL);109110q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);111exec.unbind();112}113114size_t115kernel::mem_local() const {116size_t sz = 0;117118for (auto &arg : args()) {119if (dynamic_cast<local_argument *>(&arg))120sz += arg.storage();121}122123return sz;124}125126size_t127kernel::mem_private() const {128return 0;129}130131const std::string &132kernel::name() const {133return _name;134}135136std::vector<size_t>137kernel::optimal_block_size(const command_queue &q,138const std::vector<size_t> &grid_size) const {139return factor::find_grid_optimal_factor<size_t>(140q.device().max_threads_per_block(), q.device().max_block_size(),141grid_size);142}143144std::vector<size_t>145kernel::required_block_size() const {146return find(name_equals(_name), program().symbols()).reqd_work_group_size;147}148149kernel::argument_range150kernel::args() {151return map(derefs(), _args);152}153154kernel::const_argument_range155kernel::args() const {156return map(derefs(), _args);157}158159std::vector<clover::module::arg_info>160kernel::args_infos() {161std::vector<clover::module::arg_info> infos;162for (auto &marg: find(name_equals(_name), program().symbols()).args)163if (marg.semantic == clover::module::argument::general)164infos.emplace_back(marg.info);165166return infos;167}168169const module &170kernel::module(const command_queue &q) const {171return program().build(q.device()).binary;172}173174kernel::exec_context::exec_context(kernel &kern) :175kern(kern), q(NULL), print_handler(), mem_local(0), st(NULL), cs() {176}177178kernel::exec_context::~exec_context() {179if (st)180q->pipe->delete_compute_state(q->pipe, st);181}182183void *184kernel::exec_context::bind(intrusive_ptr<command_queue> _q,185const std::vector<size_t> &grid_offset) {186std::swap(q, _q);187188// Bind kernel arguments.189auto &m = kern.program().build(q->device()).binary;190auto msym = find(name_equals(kern.name()), m.syms);191auto margs = msym.args;192auto msec = find(id_type_equals(msym.section, module::section::text_executable), m.secs);193auto explicit_arg = kern._args.begin();194195for (auto &marg : margs) {196switch (marg.semantic) {197case module::argument::general:198(*(explicit_arg++))->bind(*this, marg);199break;200201case module::argument::grid_dimension: {202const cl_uint dimension = grid_offset.size();203auto arg = argument::create(marg);204205arg->set(sizeof(dimension), &dimension);206arg->bind(*this, marg);207break;208}209case module::argument::grid_offset: {210for (cl_uint x : pad_vector(*q, grid_offset, 0)) {211auto arg = argument::create(marg);212213arg->set(sizeof(x), &x);214arg->bind(*this, marg);215}216break;217}218case module::argument::image_size: {219auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();220std::vector<cl_uint> image_size{221static_cast<cl_uint>(img->width()),222static_cast<cl_uint>(img->height()),223static_cast<cl_uint>(img->depth())};224for (auto x : image_size) {225auto arg = argument::create(marg);226227arg->set(sizeof(x), &x);228arg->bind(*this, marg);229}230break;231}232case module::argument::image_format: {233auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();234cl_image_format fmt = img->format();235std::vector<cl_uint> image_format{236static_cast<cl_uint>(fmt.image_channel_data_type),237static_cast<cl_uint>(fmt.image_channel_order)};238for (auto x : image_format) {239auto arg = argument::create(marg);240241arg->set(sizeof(x), &x);242arg->bind(*this, marg);243}244break;245}246case module::argument::constant_buffer: {247auto arg = argument::create(marg);248cl_mem buf = kern._constant_buffers.at(&q->device()).get();249arg->set(sizeof(buf), &buf);250arg->bind(*this, marg);251break;252}253case module::argument::printf_buffer: {254print_handler = printf_handler::create(q, m.printf_infos,255m.printf_strings_in_buffer,256q->device().max_printf_buffer_size());257cl_mem print_mem = print_handler->get_mem();258259auto arg = argument::create(marg);260arg->set(sizeof(cl_mem), &print_mem);261arg->bind(*this, marg);262break;263}264}265}266267// Create a new compute state if anything changed.268if (!st || q != _q ||269cs.req_local_mem != mem_local ||270cs.req_input_mem != input.size()) {271if (st)272_q->pipe->delete_compute_state(_q->pipe, st);273274cs.ir_type = q->device().ir_format();275cs.prog = &(msec.data[0]);276cs.req_local_mem = mem_local;277cs.req_input_mem = input.size();278st = q->pipe->create_compute_state(q->pipe, &cs);279if (!st) {280unbind(); // Cleanup281throw error(CL_OUT_OF_RESOURCES);282}283}284285return st;286}287288void289kernel::exec_context::unbind() {290if (print_handler)291print_handler->print();292293for (auto &arg : kern.args())294arg.unbind(*this);295296input.clear();297samplers.clear();298sviews.clear();299iviews.clear();300resources.clear();301g_buffers.clear();302g_handles.clear();303mem_local = 0;304}305306namespace {307template<typename T>308std::vector<uint8_t>309bytes(const T& x) {310return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };311}312313///314/// Transform buffer \a v from the native byte order into the byte315/// order specified by \a e.316///317template<typename T>318void319byteswap(T &v, pipe_endian e) {320if (PIPE_ENDIAN_NATIVE != e)321std::reverse(v.begin(), v.end());322}323324///325/// Pad buffer \a v to the next multiple of \a n.326///327template<typename T>328void329align(T &v, size_t n) {330v.resize(util_align_npot(v.size(), n));331}332333bool334msb(const std::vector<uint8_t> &s) {335if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)336return s.back() & 0x80;337else338return s.front() & 0x80;339}340341///342/// Resize buffer \a v to size \a n using sign or zero extension343/// according to \a ext.344///345template<typename T>346void347extend(T &v, enum module::argument::ext_type ext, size_t n) {348const size_t m = std::min(v.size(), n);349const bool sign_ext = (ext == module::argument::sign_ext);350const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);351T w(n, fill);352353if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)354std::copy_n(v.begin(), m, w.begin());355else356std::copy_n(v.end() - m, m, w.end() - m);357358std::swap(v, w);359}360361///362/// Append buffer \a w to \a v.363///364template<typename T>365void366insert(T &v, const T &w) {367v.insert(v.end(), w.begin(), w.end());368}369370///371/// Append \a n elements to the end of buffer \a v.372///373template<typename T>374size_t375allocate(T &v, size_t n) {376size_t pos = v.size();377v.resize(pos + n);378return pos;379}380}381382std::unique_ptr<kernel::argument>383kernel::argument::create(const module::argument &marg) {384switch (marg.type) {385case module::argument::scalar:386return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));387388case module::argument::global:389return std::unique_ptr<kernel::argument>(new global_argument);390391case module::argument::local:392return std::unique_ptr<kernel::argument>(new local_argument);393394case module::argument::constant:395return std::unique_ptr<kernel::argument>(new constant_argument);396397case module::argument::image_rd:398return std::unique_ptr<kernel::argument>(new image_rd_argument);399400case module::argument::image_wr:401return std::unique_ptr<kernel::argument>(new image_wr_argument);402403case module::argument::sampler:404return std::unique_ptr<kernel::argument>(new sampler_argument);405406}407throw error(CL_INVALID_KERNEL_DEFINITION);408}409410kernel::argument::argument() : _set(false) {411}412413bool414kernel::argument::set() const {415return _set;416}417418size_t419kernel::argument::storage() const {420return 0;421}422423kernel::scalar_argument::scalar_argument(size_t size) : size(size) {424}425426void427kernel::scalar_argument::set(size_t size, const void *value) {428if (!value)429throw error(CL_INVALID_ARG_VALUE);430431if (size != this->size)432throw error(CL_INVALID_ARG_SIZE);433434v = { (uint8_t *)value, (uint8_t *)value + size };435_set = true;436}437438void439kernel::scalar_argument::bind(exec_context &ctx,440const module::argument &marg) {441auto w = v;442443extend(w, marg.ext_type, marg.target_size);444byteswap(w, ctx.q->device().endianness());445align(ctx.input, marg.target_align);446insert(ctx.input, w);447}448449void450kernel::scalar_argument::unbind(exec_context &ctx) {451}452453kernel::global_argument::global_argument() : buf(nullptr), svm(nullptr) {454}455456void457kernel::global_argument::set(size_t size, const void *value) {458if (size != sizeof(cl_mem))459throw error(CL_INVALID_ARG_SIZE);460461buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);462svm = nullptr;463_set = true;464}465466void467kernel::global_argument::set_svm(const void *value) {468svm = value;469buf = nullptr;470_set = true;471}472473void474kernel::global_argument::bind(exec_context &ctx,475const module::argument &marg) {476align(ctx.input, marg.target_align);477478if (buf) {479const resource &r = buf->resource_in(*ctx.q);480ctx.g_handles.push_back(ctx.input.size());481ctx.g_buffers.push_back(r.pipe);482483// How to handle multi-demensional offsets?484// We don't need to. Buffer offsets are always485// one-dimensional.486auto v = bytes(r.offset[0]);487extend(v, marg.ext_type, marg.target_size);488byteswap(v, ctx.q->device().endianness());489insert(ctx.input, v);490} else if (svm) {491auto v = bytes(svm);492extend(v, marg.ext_type, marg.target_size);493byteswap(v, ctx.q->device().endianness());494insert(ctx.input, v);495} else {496// Null pointer.497allocate(ctx.input, marg.target_size);498}499}500501void502kernel::global_argument::unbind(exec_context &ctx) {503}504505size_t506kernel::local_argument::storage() const {507return _storage;508}509510void511kernel::local_argument::set(size_t size, const void *value) {512if (value)513throw error(CL_INVALID_ARG_VALUE);514515if (!size)516throw error(CL_INVALID_ARG_SIZE);517518_storage = size;519_set = true;520}521522void523kernel::local_argument::bind(exec_context &ctx,524const module::argument &marg) {525auto v = bytes(ctx.mem_local);526527extend(v, module::argument::zero_ext, marg.target_size);528byteswap(v, ctx.q->device().endianness());529align(ctx.input, marg.target_align);530insert(ctx.input, v);531532ctx.mem_local += _storage;533}534535void536kernel::local_argument::unbind(exec_context &ctx) {537}538539kernel::constant_argument::constant_argument() : buf(nullptr), st(nullptr) {540}541542void543kernel::constant_argument::set(size_t size, const void *value) {544if (size != sizeof(cl_mem))545throw error(CL_INVALID_ARG_SIZE);546547buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);548_set = true;549}550551void552kernel::constant_argument::bind(exec_context &ctx,553const module::argument &marg) {554align(ctx.input, marg.target_align);555556if (buf) {557resource &r = buf->resource_in(*ctx.q);558auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);559560extend(v, module::argument::zero_ext, marg.target_size);561byteswap(v, ctx.q->device().endianness());562insert(ctx.input, v);563564st = r.bind_surface(*ctx.q, false);565ctx.resources.push_back(st);566} else {567// Null pointer.568allocate(ctx.input, marg.target_size);569}570}571572void573kernel::constant_argument::unbind(exec_context &ctx) {574if (buf)575buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st);576}577578void579kernel::image_rd_argument::set(size_t size, const void *value) {580if (!value)581throw error(CL_INVALID_ARG_VALUE);582583if (size != sizeof(cl_mem))584throw error(CL_INVALID_ARG_SIZE);585586img = &obj<image>(*(cl_mem *)value);587_set = true;588}589590void591kernel::image_rd_argument::bind(exec_context &ctx,592const module::argument &marg) {593auto v = bytes(ctx.sviews.size());594595extend(v, module::argument::zero_ext, marg.target_size);596byteswap(v, ctx.q->device().endianness());597align(ctx.input, marg.target_align);598insert(ctx.input, v);599600st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q);601ctx.sviews.push_back(st);602}603604void605kernel::image_rd_argument::unbind(exec_context &ctx) {606img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st);607}608609void610kernel::image_wr_argument::set(size_t size, const void *value) {611if (!value)612throw error(CL_INVALID_ARG_VALUE);613614if (size != sizeof(cl_mem))615throw error(CL_INVALID_ARG_SIZE);616617img = &obj<image>(*(cl_mem *)value);618_set = true;619}620621void622kernel::image_wr_argument::bind(exec_context &ctx,623const module::argument &marg) {624auto v = bytes(ctx.iviews.size());625626extend(v, module::argument::zero_ext, marg.target_size);627byteswap(v, ctx.q->device().endianness());628align(ctx.input, marg.target_align);629insert(ctx.input, v);630ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q));631}632633void634kernel::image_wr_argument::unbind(exec_context &ctx) {635}636637kernel::sampler_argument::sampler_argument() : s(nullptr), st(nullptr) {638}639640void641kernel::sampler_argument::set(size_t size, const void *value) {642if (!value)643throw error(CL_INVALID_SAMPLER);644645if (size != sizeof(cl_sampler))646throw error(CL_INVALID_ARG_SIZE);647648s = &obj(*(cl_sampler *)value);649_set = true;650}651652void653kernel::sampler_argument::bind(exec_context &ctx,654const module::argument &marg) {655st = s->bind(*ctx.q);656ctx.samplers.push_back(st);657}658659void660kernel::sampler_argument::unbind(exec_context &ctx) {661s->unbind(*ctx.q, st);662}663664665