Path: blob/master/modules/core/src/intel_gpu_gemm.inl.hpp
16337 views
/*
 * Copyright 2015-2017 Philippe Tillet
 * Copyright (c) 2017, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifdef HAVE_OPENCL

#include <sstream>
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"

namespace cv
{

/**
 * @brief Launches one of the "intelblas_gemm_buffer_*" OpenCL kernels for a GEMM call.
 *
 * The kernel variant is selected from the transpose flags:
 *   - neither transposed : "intelblas_gemm_buffer_NN", or "..._NN_sp" when
 *                          M and N are multiples of 32 and K is a multiple of 16
 *   - only A transposed  : "intelblas_gemm_buffer_TN"
 *   - only B transposed  : "intelblas_gemm_buffer_NT"
 *   - both transposed    : "intelblas_gemm_buffer_TT"
 *
 * Buffer offsets and row steps are converted from bytes to element counts by
 * dividing by sizeof(float), and alpha/beta are narrowed to float before being
 * passed to the kernel.
 * NOTE(review): this implies the matrices hold 32-bit floats; the function does
 * not check the element type itself -- confirm the caller guarantees it.
 *
 * @param A       first input matrix (bound read-only)
 * @param sizeA   size of A; supplies K (height if atrans, width otherwise)
 * @param B       second input matrix (bound read-only)
 * @param sizeB   unused
 * @param D       output matrix; its size supplies M (height) and N (width)
 * @param sizeD   size of D
 * @param alpha   scalar forwarded to the kernel as float
 * @param beta    scalar forwarded to the kernel as float
 * @param atrans  true if A is to be used transposed
 * @param btrans  true if B is to be used transposed
 * @return false if the kernel could not be created or a launch failed,
 *         otherwise the result of the last launch (true if nothing was launched).
 */
static bool intel_gpu_gemm(
    UMat A, Size sizeA,
    UMat B, Size sizeB,
    UMat D, Size sizeD,
    double alpha, double beta,
    bool atrans, bool btrans)
{
    CV_UNUSED(sizeB);

    int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);

    std::string kernelName;
    bool ret = true;

    // lx/ly: work-group (local) size. dx/dy: divisors that turn N and M into
    // work-item counts along x and y.
    size_t lx = 8, ly = 4;
    size_t dx = 4, dy = 8;

    if(!atrans && !btrans)
    {
        // The "_sp" variant is only selected for sizes that satisfy these divisibility rules.
        if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
        {
            kernelName = "intelblas_gemm_buffer_NN_sp";
        }
        else
        {
            kernelName = "intelblas_gemm_buffer_NN";
        }
    }
    else if(atrans && !btrans)
    {
        kernelName = "intelblas_gemm_buffer_TN";
    }
    else if(!atrans && btrans)
    {
        kernelName = "intelblas_gemm_buffer_NT";
        // The NT kernel is launched with a different work-group shape and x divisor.
        ly = 16;
        dx = 1;
    }
    else
    {
        kernelName = "intelblas_gemm_buffer_TT";
    }

    // Work-item counts (ceil division), then rounded up to a multiple of the local size.
    const size_t gx = (size_t)(N + dx - 1) / dx;
    const size_t gy = (size_t)(M + dy - 1) / dy;

    size_t local[] = {lx, ly, 1};
    size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};

    // Outputs below 1024*1024 elements use a stride so large that the K loop
    // below runs once for any K <= 10000000; larger outputs walk K in chunks of 256.
    // The product is computed in 64 bits: M * N as int would overflow (undefined
    // behaviour) for very large outputs and could select the wrong stride.
    const bool smallOutput = static_cast<int64>(M) * static_cast<int64>(N)
                             < static_cast<int64>(1024) * 1024;
    int stride = smallOutput ? 10000000 : 256;

    ocl::Queue q;
    String errmsg;
    const ocl::Program program = ocl::Context::getDefault().getProg(ocl::core::intel_gemm_oclsrc, "", errmsg);

    // NOTE(review): D is bound with PtrWriteOnly in both branches even though beta is
    // passed to the kernel -- confirm the kernels never need the previous contents of D
    // to be visible through this binding.
    if(!atrans && btrans)
    {
        // The NT kernel takes no start_index/stride arguments and is launched once.
        ocl::Kernel k(kernelName.c_str(), program);
        if (k.empty())
        {
            return false;
        }

        k.args(ocl::KernelArg::PtrReadOnly(A),
               (int) (A.offset / sizeof(float)),
               ocl::KernelArg::PtrReadOnly(B),
               (int) (B.offset / sizeof(float)),
               ocl::KernelArg::PtrWriteOnly(D),
               (int) (D.offset / sizeof(float)),
               M, N, K,
               (float)alpha,
               (float)beta,
               (int)(A.step / sizeof(float)),
               (int)(B.step / sizeof(float)),
               (int)(D.step / sizeof(float))
        );

        ret = k.run(2, global, local, false, q);
    }
    else
    {
        // One launch per K chunk; each launch receives the chunk start and the stride.
        for(int start_index = 0; start_index < K; start_index += stride)
        {
            ocl::Kernel k(kernelName.c_str(), program);
            // Same guard as the NT branch: bail out if the kernel could not be created.
            if (k.empty())
            {
                return false;
            }

            k.args(ocl::KernelArg::PtrReadOnly(A),
                   (int) (A.offset / sizeof(float)),
                   ocl::KernelArg::PtrReadOnly(B),
                   (int) (B.offset / sizeof(float)),
                   ocl::KernelArg::PtrWriteOnly(D),
                   (int) (D.offset / sizeof(float)),
                   M, N, K,
                   (float)alpha,
                   (float)beta,
                   (int)(A.step / sizeof(float)),
                   (int)(B.step / sizeof(float)),
                   (int)(D.step / sizeof(float)),
                   (int) start_index,                      // 14 start_index
                   stride);

            ret = k.run(2, global, local, false, q);
            if (!ret) return ret;
        }
    }

    return ret;
}

} // namespace cv

#endif