Path: blob/master/runtime/compiler/codegen/CodeGenGPU.cpp
6000 views
/*******************************************************************************1* Copyright (c) 2000, 2021 IBM Corp. and others2*3* This program and the accompanying materials are made available under4* the terms of the Eclipse Public License 2.0 which accompanies this5* distribution and is available at https://www.eclipse.org/legal/epl-2.0/6* or the Apache License, Version 2.0 which accompanies this distribution and7* is available at https://www.apache.org/licenses/LICENSE-2.0.8*9* This Source Code may also be made available under the following10* Secondary Licenses when the conditions for such availability set11* forth in the Eclipse Public License, v. 2.0 are satisfied: GNU12* General Public License, version 2 with the GNU Classpath13* Exception [1] and GNU General Public License, version 2 with the14* OpenJDK Assembly Exception [2].15*16* [1] https://www.gnu.org/software/classpath/license.html17* [2] http://openjdk.java.net/legal/assembly-exception.html18*19* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception20*******************************************************************************/2122/**23* Support code for TR::CodeGenerator. Code related to generating GPU24*/2526#include "codegen/CodeGenerator.hpp"27#include "codegen/CodeGenerator_inlines.hpp"28#include "codegen/RecognizedMethods.hpp"29#include "il/AutomaticSymbol.hpp"30#include "il/Block.hpp"31#include "il/Node.hpp"32#include "il/Node_inlines.hpp"33#include "il/ParameterSymbol.hpp"34#include "il/TreeTop.hpp"35#include "il/TreeTop_inlines.hpp"36#include "infra/String.hpp"37#include "env/CompilerEnv.hpp"38#include "env/StackMemoryRegion.hpp"39#include "env/annotations/GPUAnnotation.hpp"40#include "optimizer/Dominators.hpp"41#include "optimizer/Structure.hpp"42#include "omrformatconsts.h"4344#define OPT_DETAILS "O^O CODE GENERATION: "4546static const char* getOpCodeName(TR::ILOpCodes opcode) {4748TR_ASSERT(opcode < TR::NumIlOps, "Wrong opcode");4950switch(opcode)51{52case TR::iload:53case TR::fload:54case TR::dload:55case TR::aload:56case TR::bload:57case TR::sload:58case TR::lload:59case TR::iloadi:60case TR::floadi:61case TR::dloadi:62case TR::aloadi:63case TR::bloadi:64case TR::sloadi:65case TR::lloadi:66return "load";6768case TR::istore:69case TR::lstore:70case TR::fstore:71case TR::dstore:72case TR::astore:73case TR::bstore:74case TR::sstore:75case TR::lstorei:76case TR::fstorei:77case TR::dstorei:78case TR::astorei:79case TR::bstorei:80case TR::sstorei:81case TR::istorei:82return "store";8384case TR::Goto:85return "br";8687case TR::ireturn:88case TR::lreturn:89case TR::freturn:90case TR::dreturn:91case TR::areturn:92case TR::Return:93return "ret";9495case TR::iadd:96case TR::ladd:97case TR::badd:98case TR::sadd:99return "add";100101case TR::fadd:102case TR::dadd:103return "fadd";104105case TR::isub:106case TR::lsub:107case TR::bsub:108case TR::ssub:109case TR::ineg:110case TR::lneg:111case TR::bneg:112case TR::sneg:113return "sub";114115case TR::dsub:116case TR::fsub:117case TR::fneg:118case TR::dneg:119return "fsub";120121case TR::imul:122case TR::lmul:123case TR::bmul:124case TR::smul:125return "mul";126127case TR::fmul:128case TR::dmul:129return "fmul";130131case TR::idiv:132case TR::ldiv:133case TR::bdiv:134case TR::sdiv:135return "sdiv";136137case TR::fdiv:138case TR::ddiv:139return "fdiv";140141case TR::iudiv:142case TR::ludiv:143return "udiv";144145case TR::irem:146case TR::lrem:147case TR::brem:148case TR::srem:149return "srem";150151case TR::frem:152case TR::drem:153return "frem";154155case TR::iurem:156return "urem";157158case TR::ishl:159case TR::lshl:160case TR::bshl:161case TR::sshl:162return "shl";163164case TR::ishr:165case TR::lshr:166case TR::bshr:167case TR::sshr:168return "ashr";169170case TR::iushr:171case TR::lushr:172case TR::bushr:173case TR::sushr:174return "lshr";175176case TR::iand:177case TR::land:178case TR::band:179case TR::sand:180return "and";181182case TR::ior:183case TR::lor:184case TR::bor:185case TR::sor:186return "or";187188case TR::ixor:189case TR::lxor:190case TR::bxor:191case TR::sxor:192return "xor";193194case TR::i2l:195case TR::b2i:196case TR::b2l:197case TR::b2s:198case TR::s2i:199case TR::s2l:200return "sext";201202case TR::i2f:203case TR::i2d:204case TR::l2f:205case TR::l2d:206case TR::b2f:207case TR::b2d:208case TR::s2f:209case TR::s2d:210return "sitofp";211212case TR::i2b:213case TR::i2s:214case TR::l2i:215case TR::l2b:216case TR::l2s:217case TR::s2b:218return "trunc";219220case TR::l2a:221case TR::i2a:222case TR::s2a:223case TR::b2a:224case TR::lu2a:225case TR::iu2a:226case TR::su2a:227case TR::bu2a:228return "inttoptr";229230case TR::iu2l:231case TR::bu2i:232case TR::bu2l:233case TR::bu2s:234case TR::su2i:235case TR::su2l:236return "zext";237238case TR::iu2f:239case TR::iu2d:240case TR::lu2f:241case TR::lu2d:242case TR::bu2f:243case TR::bu2d:244case TR::su2f:245case TR::su2d:246return "uitofp";247248case TR::f2i:249case TR::f2l:250case TR::f2b:251case TR::f2s:252case TR::d2i:253case TR::d2l:254case TR::d2b:255case TR::d2s:256return "fptosi";257258case TR::f2d:259return "fpext";260261case TR::d2f:262return "fptrunc";263264case TR::a2i:265case TR::a2l:266case TR::a2b:267case TR::a2s:268return "ptrtoint";269270case TR::icmpeq:271case TR::lcmpeq:272case TR::acmpeq:273case TR::bcmpeq:274case TR::scmpeq:275case TR::ificmpeq:276case TR::iflcmpeq:277case TR::ifacmpeq:278case TR::ifbcmpeq:279case TR::ifscmpeq:280return "icmp eq";281282case TR::icmpne:283case TR::lcmpne:284case TR::acmpne:285case TR::bcmpne:286case TR::scmpne:287case TR::ificmpne:288case TR::iflcmpne:289case TR::ifacmpne:290case TR::ifbcmpne:291case TR::ifscmpne:292return "icmp ne";293294case TR::icmplt:295case TR::lcmplt:296case TR::bcmplt:297case TR::scmplt:298case TR::ificmplt:299case TR::iflcmplt:300case TR::ifbcmplt:301case TR::ifscmplt:302return "icmp slt";303304case TR::icmpge:305case TR::lcmpge:306case TR::bcmpge:307case TR::scmpge:308case TR::ificmpge:309case TR::iflcmpge:310case TR::ifbcmpge:311case TR::ifscmpge:312return "icmp sge";313314case TR::icmpgt:315case TR::lcmpgt:316case TR::bcmpgt:317case TR::scmpgt:318case TR::ificmpgt:319case TR::iflcmpgt:320case TR::ifbcmpgt:321case TR::ifscmpgt:322return "icmp sgt";323324case TR::icmple:325case TR::lcmple:326case TR::bcmple:327case TR::scmple:328case TR::ificmple:329case TR::iflcmple:330case TR::ifbcmple:331case TR::ifscmple:332return "icmp sle";333334case TR::acmplt:335case TR::iucmplt:336case TR::lucmplt:337case TR::bucmplt:338case TR::sucmplt:339case TR::ifacmplt:340case TR::ifiucmplt:341case TR::iflucmplt:342case TR::ifbucmplt:343case TR::ifsucmplt:344return "icmp ult";345346case TR::acmpge:347case TR::iucmpge:348case TR::bucmpge:349case TR::lucmpge:350case TR::sucmpge:351case TR::ifacmpge:352case TR::ifiucmpge:353case TR::iflucmpge:354case TR::ifbucmpge:355case TR::ifsucmpge:356return "icmp uge";357358case TR::acmpgt:359case TR::iucmpgt:360case TR::lucmpgt:361case TR::bucmpgt:362case TR::sucmpgt:363case TR::ifacmpgt:364case TR::ifiucmpgt:365case TR::iflucmpgt:366case TR::ifbucmpgt:367case TR::ifsucmpgt:368return "icmp ugt";369370case TR::acmple:371case TR::iucmple:372case TR::lucmple:373case TR::bucmple:374case TR::sucmple:375case TR::ifacmple:376case TR::ifiucmple:377case TR::iflucmple:378case TR::ifbucmple:379case TR::ifsucmple:380return "icmp ule";381382case TR::fcmpeq:383case TR::dcmpeq:384case TR::iffcmpeq:385case TR::ifdcmpeq:386return "fcmp oeq";387388case TR::fcmpne:389case TR::dcmpne:390case TR::iffcmpne:391case TR::ifdcmpne:392return "fcmp one";393394case TR::fcmplt:395case TR::dcmplt:396case TR::iffcmplt:397case TR::ifdcmplt:398return "fcmp olt";399400case TR::fcmpge:401case TR::dcmpge:402case TR::iffcmpge:403case TR::ifdcmpge:404return "fcmp oge";405406case TR::fcmpgt:407case TR::dcmpgt:408case TR::iffcmpgt:409case TR::ifdcmpgt:410return "fcmp ogt";411412case TR::fcmple:413case TR::dcmple:414case TR::iffcmple:415case TR::ifdcmple:416return "fcmp ole";417418case TR::fcmpequ:419case TR::dcmpequ:420case TR::iffcmpequ:421case TR::ifdcmpequ:422return "fcmp ueq";423424case TR::fcmpneu:425case TR::dcmpneu:426case TR::iffcmpneu:427case TR::ifdcmpneu:428return "fcmp une";429430case TR::fcmpltu:431case TR::dcmpltu:432case TR::iffcmpltu:433case TR::ifdcmpltu:434return "fcmp ult";435436case TR::fcmpgeu:437case TR::dcmpgeu:438case TR::iffcmpgeu:439case TR::ifdcmpgeu:440return "fcmp uge";441442case TR::fcmpgtu:443case TR::dcmpgtu:444case TR::iffcmpgtu:445case TR::ifdcmpgtu:446return "fcmp ugt";447448case TR::fcmpleu:449case TR::dcmpleu:450case TR::iffcmpleu:451case TR::ifdcmpleu:452return "fcmp ule";453454case TR::d2c:455case TR::f2c:456case TR::f2bu:457case TR::f2iu:458case TR::f2lu:459case TR::d2iu:460case TR::d2lu:461case TR::d2bu:462return "fptoui";463464case TR::aiadd:465case TR::aladd:466return "getelementptr";467468case TR::ibits2f:469case TR::fbits2i:470case TR::lbits2d:471case TR::dbits2l:472return "bitcast";473474case TR::lookup:475case TR::table:476return "switch";477478case TR::BBStart:479case TR::BBEnd:480return "";481482case TR::newarray:483return "INVALID";484485default:486return NULL;487}488489}490491492char *nvvmTypeNames[TR::NumTypes] =493{494"void", // "TR::NoType"495"i8", // "TR::Int8"496"i16", // "TR::Int16"497"i32", // "TR::Int32"498"i64", // "TR::Int64"499"float", // "TR::Float"500"double", // "TR::Double"501"i8*" // "TR::Address"502};503504static const char* getTypeName(TR::DataType type) {505if (type >= TR::NoType && type <= TR::Address)506{507return nvvmTypeNames[type];508}509else510{511TR_ASSERT(false, "Unsupported type");512return "???";513}514}515516char *nvvmVarTypeNames[TR::NumTypes] =517{518"void", // "TR::NoType"519"i8", // "TR::Int8"520"i16", // "TR::Int16"521"i32", // "TR::Int32"522"i64", // "TR::Int64"523"f32", // "TR::Float"524"f64", // "TR::Double"525"p64" // "TR::Address"526};527528static const char* getVarTypeName(TR::DataType type) {529if (type >= TR::NoType && type <= TR::Address)530{531return nvvmVarTypeNames[type];532}533else534{535TR_ASSERT(false, "Unsupported type");536return "???";537}538}539540#define MAX_NAME 256541542543static void getParmName(int32_t slot, char * s, bool addr = true)544{545TR::snprintfNoTrunc(s, MAX_NAME, "%%p%" OMR_PRId32 "%s", slot, addr ? ".addr" : "");546}547548549static void getAutoOrParmName(TR::Symbol *sym, char * s, bool addr = true)550{551TR_ASSERT(sym->isAutoOrParm(), "expecting auto or parm");552553if (sym->isParm())554TR::snprintfNoTrunc(s, MAX_NAME, "%%p%" OMR_PRId32 "%s", sym->castToParmSymbol()->getSlot(), addr ? ".addr" : "");555else556TR::snprintfNoTrunc(s, MAX_NAME, "%%a%" OMR_PRId32 "%s", sym->castToAutoSymbol()->getLiveLocalIndex(), addr ? ".addr" : "");557}558559560#define INIT_BUFFER_SIZE 65535561562class NVVMIRBuffer563{564public:565NVVMIRBuffer(TR_Memory* mem)566{567m = mem;568size = INIT_BUFFER_SIZE;569buffer = (char*)m->allocateHeapMemory(size);570s = buffer;571}572void print(char *format, ...)573{574va_list args;575va_start (args, format);576int32_t left = size - (s - buffer);577578va_list args_copy;579va_copy(args_copy, args);580int32_t len = vsnprintf(s, left, format, args_copy);581va_copy_end(args_copy);582583if ((len + 1) > left)584{585expand(len + 1 - left);586left = size - (s - buffer);587len = vsnprintf(s, left, format, args);588}589590s += len;591va_end(args);592}593594char * getString() { return buffer; }595596private:597598void expand(int32_t min)599{600size += (min >= size) ? size*2 : size;601602char * newBuffer = (char*)m->allocateHeapMemory(size);603memcpy(newBuffer, buffer, s - buffer);604s = newBuffer + (s - buffer);605buffer = newBuffer;606}607608char *buffer;609char *s;610int32_t size;611TR_Memory* m;612};613614615static void getNodeName(TR::Node* node, char * s, TR::Compilation *comp)616{617if (node->getOpCode().isLoadConst())618{619bool isUnsigned = node->getOpCode().isUnsigned();620switch (node->getDataType())621{622case TR::Int8:623if(isUnsigned)624TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu8, node->getUnsignedByte());625else626TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRId8, node->getByte());627break;628case TR::Int16:629TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu16, node->getConst<uint16_t>());630break;631case TR::Int32:632if(isUnsigned)633TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu32, node->getUnsignedInt());634else635TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRId32, node->getInt());636break;637case TR::Int64:638if(isUnsigned)639TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu64, node->getUnsignedLongInt());640else641TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRId64, node->getLongInt());642break;643case TR::Float:644union645{646double doubleValue;647int64_t doubleBits;648};649doubleValue = node->getFloat();650TR::snprintfNoTrunc(s, MAX_NAME, "0x%016" OMR_PRIx64, doubleBits);651break;652case TR::Double:653TR::snprintfNoTrunc(s, MAX_NAME, "0x%016" OMR_PRIx64, node->getDoubleBits());654break;655case TR::Address:656if (node->getAddress() == 0)657TR::snprintfNoTrunc(s, MAX_NAME, "null");658else659TR_ASSERT(0, "Non-null Address constants should not occur.\n");660break;661default:662TR_ASSERT(0, "Unknown/unimplemented data type\n");663}664}665else666{667TR::snprintfNoTrunc(s, MAX_NAME, "%%%" OMR_PRIu32, node->getLocalIndex());668}669}670671char* getNVVMMathFunctionName(TR::Node *node)672{673switch (((TR::MethodSymbol*)node->getSymbolReference()->getSymbol())->getRecognizedMethod())674{675case TR::java_lang_Math_sqrt:676return "sqrt";677case TR::java_lang_Math_sin:678case TR::java_lang_StrictMath_sin:679return "sin";680case TR::java_lang_Math_cos:681case TR::java_lang_StrictMath_cos:682return "cos";683case TR::java_lang_Math_log:684case TR::java_lang_StrictMath_log:685return "log";686case TR::java_lang_Math_exp:687case TR::java_lang_StrictMath_exp:688return "exp";689case TR::java_lang_Math_abs_F:690return "fabsf";691case TR::java_lang_Math_abs_D:692return "fabs";693default:694return "ERROR";695}696return "ERROR";697}698699bool J9::CodeGenerator::handleRecognizedMethod(TR::Node *node, NVVMIRBuffer &ir, TR::Compilation *comp)700{701char name0[MAX_NAME];702switch (((TR::MethodSymbol*)node->getSymbolReference()->getSymbol())->getRecognizedMethod())703{704case TR::com_ibm_gpu_Kernel_blockIdxX:705ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n", node->getLocalIndex());706break;707case TR::com_ibm_gpu_Kernel_blockIdxY:708ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()\n", node->getLocalIndex());709break;710case TR::com_ibm_gpu_Kernel_blockIdxZ:711ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()\n", node->getLocalIndex());712break;713case TR::com_ibm_gpu_Kernel_blockDimX:714ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n", node->getLocalIndex());715break;716case TR::com_ibm_gpu_Kernel_blockDimY:717ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n", node->getLocalIndex());718break;719case TR::com_ibm_gpu_Kernel_blockDimZ:720ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()\n", node->getLocalIndex());721break;722case TR::com_ibm_gpu_Kernel_threadIdxX:723ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n", node->getLocalIndex());724break;725case TR::com_ibm_gpu_Kernel_threadIdxY:726ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n", node->getLocalIndex());727break;728case TR::com_ibm_gpu_Kernel_threadIdxZ:729ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()\n", node->getLocalIndex());730break;731case TR::com_ibm_gpu_Kernel_syncThreads:732ir.print(" call void @llvm.nvvm.barrier0()\n");733node->setLocalIndex(_gpuNodeCount--);734break;735case TR::java_lang_Math_sqrt:736case TR::java_lang_Math_sin:737case TR::java_lang_Math_cos:738case TR::java_lang_Math_log:739case TR::java_lang_Math_exp:740case TR::java_lang_Math_abs_D:741if (!comp->getOptions()->getEnableGPU(TR_EnableGPUEnableMath)) return false;742getNodeName(node->getChild(0), name0, comp);743ir.print(" %%%d = call double @__nv_%s(double %s)\n", node->getLocalIndex(), getNVVMMathFunctionName(node), name0);744break;745case TR::java_lang_StrictMath_sin:746case TR::java_lang_StrictMath_cos:747case TR::java_lang_StrictMath_log:748case TR::java_lang_StrictMath_exp:749if (!comp->getOptions()->getEnableGPU(TR_EnableGPUEnableMath)) return false;750getNodeName(node->getChild(1), name0, comp);751ir.print(" %%%d = call double @__nv_%s(double %s)\n", node->getLocalIndex(), getNVVMMathFunctionName(node), name0);752break;753case TR::java_lang_Math_abs_F:754if (!comp->getOptions()->getEnableGPU(TR_EnableGPUEnableMath)) return false;755getNodeName(node->getChild(0), name0, comp);756ir.print(" %%%d = call float @__nv_%s(float %s)\n", node->getLocalIndex(), getNVVMMathFunctionName(node), name0);757break;758default:759return false;760}761return true;762}763764765bool J9::CodeGenerator::handleRecognizedField(TR::Node *node, NVVMIRBuffer &ir)766{767switch (node->getSymbolReference()->getSymbol()->getRecognizedField())768{769case TR::Symbol::Com_ibm_gpu_Kernel_blockIdxX:770ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n", node->getLocalIndex());771break;772case TR::Symbol::Com_ibm_gpu_Kernel_blockIdxY:773ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()\n", node->getLocalIndex());774break;775case TR::Symbol::Com_ibm_gpu_Kernel_blockIdxZ:776ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()\n", node->getLocalIndex());777break;778case TR::Symbol::Com_ibm_gpu_Kernel_blockDimX:779ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n", node->getLocalIndex());780break;781case TR::Symbol::Com_ibm_gpu_Kernel_blockDimY:782ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n", node->getLocalIndex());783break;784case TR::Symbol::Com_ibm_gpu_Kernel_blockDimZ:785ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()\n", node->getLocalIndex());786break;787case TR::Symbol::Com_ibm_gpu_Kernel_threadIdxX:788ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n", node->getLocalIndex());789break;790case TR::Symbol::Com_ibm_gpu_Kernel_threadIdxY:791ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n", node->getLocalIndex());792break;793case TR::Symbol::Com_ibm_gpu_Kernel_threadIdxZ:794ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()\n", node->getLocalIndex());795break;796case TR::Symbol::Com_ibm_gpu_Kernel_syncThreads:797ir.print(" call void @llvm.nvvm.barrier0()\n");798node->setLocalIndex(_gpuNodeCount--);799break;800default:801return false;802}803return true;804}805806void J9::CodeGenerator::printArrayCopyNVVMIR(TR::Node *node, NVVMIRBuffer &ir, TR::Compilation *comp)807{808//Some forms of array copy have five children. First two nodes are used for write barriers which we don't need809// Three child version810// child 0 ------ Source byte address;811// child 1 ------ Destination byte address;812// child 2 ------ Copy length in byte;813// Five child version:814// child 0 ------ Source array object; (skipped)815// child 1 ------ Destination array object; (skipped)816// child 2 ------ Source byte address;817// child 3 ------ Destination byte address;818// child 4 ------ Copy length in byte;819//childrenNodeOffset is set such that we access Source byte address, Destination byte address and Copy length.820int childrenNodeOffset = node->getNumChildren() == 5 ? 2 : 0;821822char name0[MAX_NAME], name1[MAX_NAME], name2[MAX_NAME];823getNodeName(node->getChild(0+childrenNodeOffset), name0, comp);824getNodeName(node->getChild(1+childrenNodeOffset), name1, comp);825getNodeName(node->getChild(2+childrenNodeOffset), name2, comp);826827int arrayCopyID = node->getLocalIndex();828bool isWordCopy = node->chkWordElementArrayCopy();829bool isHalfwordCopy = node->chkHalfWordElementArrayCopy();830bool unknownCopy = !(isWordCopy || isHalfwordCopy);831bool isBackwardsCopy = !node->isForwardArrayCopy();832bool is64bitCopyLength = (node->getChild(2+childrenNodeOffset)->getDataType() == TR::Int64);833834/* Example NVVM IR:835836; Inputs to the array copy that come from the children:837%8 = getelementptr inbounds i8* %7, i64 76 ; Source addr838%10 = getelementptr inbounds i8* %9, i64 76 ; Destination addr839%14 = mul i64 %13, 2 ; Copy Length in bytes840841; Generated ArrayCopy NVVM IR842; This is a reverse halfword array copy843br label %ArrayCopy15844ArrayCopy15:845%15 = ptrtoint i8* %8 to i64 ; Generated for reverse array copy.846%16 = ptrtoint i8* %10 to i64 ; Changes source and destination847%17 = add i64 %15, %14 ; to point to the end of the array848%18 = add i64 %16, %14 ; These lines are not generated849%19 = sub i64 %17, 2 ; for a forward array copy850%20 = sub i64 %18, 2 ;851%21 = inttoptr i64 %19 to i8* ;852%22 = inttoptr i64 %20 to i8* ;853br label %ArrayCopyHeader15854ArrayCopyHeader15:855%23 = phi i64 [ %14, %ArrayCopy15 ], [ %36, %ArrayCopyBody15 ] ; Phi nodes save a different value to the temp856%24 = phi i8* [ %21, %ArrayCopy15 ], [ %34, %ArrayCopyBody15 ] ; based on the name of the previous block before857%25 = phi i8* [ %22, %ArrayCopy15 ], [ %35, %ArrayCopyBody15 ] ; jumping to ArrayCopyHeader15858%26 = bitcast i8* %24 to i16*859%27 = bitcast i8* %25 to i16*860%28 = icmp sle i64 %23, 0861br i1 %28, label %AfterArrayCopy15, label %ArrayCopyBody15 ; branch to exit if no more work to do862ArrayCopyBody15:863%29 = load i16* %26 ; load data from input array864store i16 %29, i16* %27 ; store data to output array865%30 = ptrtoint i16* %26 to i64866%31 = ptrtoint i16* %27 to i64867%32 = sub i64 %30, 2 ; sub is used for reverse copy, add used for forward copy868%33 = sub i64 %31, 2 ; sub is used for reverse copy, add used for forward copy869%34 = inttoptr i64 %32 to i8*870%35 = inttoptr i64 %33 to i8*871%36 = sub i64 %23, 2 ; decrement copy length872br label %ArrayCopyHeader15873AfterArrayCopy15:874*/875876877//create a new block so the phi nodes know the name of the preceding block878ir.print(" br label %%ArrayCopy%d\n", arrayCopyID);879ir.print("ArrayCopy%d:\n", arrayCopyID);880881//for a backwards copy, the source and destination pointers need to be adjusted to882//point to the last element.883if (isBackwardsCopy)884{885if (!is64bitCopyLength)886{887ir.print(" %%%d = sext %s %s to i64\n",888node->getLocalIndex(),889getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),890name2);891node->setLocalIndex(_gpuNodeCount++);892}893894ir.print(" %%%d = ptrtoint %s %s to i64\n",895node->getLocalIndex(),896getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),897name0);898node->setLocalIndex(_gpuNodeCount++);899900ir.print(" %%%d = ptrtoint %s %s to i64\n",901node->getLocalIndex(),902getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),903name1);904node->setLocalIndex(_gpuNodeCount++);905906if (is64bitCopyLength)907{908ir.print(" %%%d = add i64 %%%d, %s\n",909node->getLocalIndex(),910node->getLocalIndex()-2,911name2);912node->setLocalIndex(_gpuNodeCount++);913914ir.print(" %%%d = add i64 %%%d, %s\n",915node->getLocalIndex(),916node->getLocalIndex()-2,917name2);918node->setLocalIndex(_gpuNodeCount++);919}920else921{922ir.print(" %%%d = add i64 %%%d, %%%d\n",923node->getLocalIndex(),924node->getLocalIndex()-2,925node->getLocalIndex()-3);926node->setLocalIndex(_gpuNodeCount++);927928ir.print(" %%%d = add i64 %%%d, %%%d\n",929node->getLocalIndex(),930node->getLocalIndex()-2,931node->getLocalIndex()-4);932node->setLocalIndex(_gpuNodeCount++);933}934935ir.print(" %%%d = sub i64 %%%d, %d\n",936node->getLocalIndex(),937node->getLocalIndex()-2,938isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);939node->setLocalIndex(_gpuNodeCount++);940941ir.print(" %%%d = sub i64 %%%d, %d\n",942node->getLocalIndex(),943node->getLocalIndex()-2,944isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);945node->setLocalIndex(_gpuNodeCount++);946947ir.print(" %%%d = inttoptr i64 %%%d to %s\n",948node->getLocalIndex(),949node->getLocalIndex()-2,950getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()));951node->setLocalIndex(_gpuNodeCount++);952953ir.print(" %%%d = inttoptr i64 %%%d to %s\n",954node->getLocalIndex(),955node->getLocalIndex()-2,956getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()));957node->setLocalIndex(_gpuNodeCount++);958}959960ir.print(" br label %%ArrayCopyHeader%d\n", arrayCopyID);961ir.print("ArrayCopyHeader%d:\n", arrayCopyID);962963//copy length in bytes964ir.print(" %%%d = phi %s [ %s, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",965node->getLocalIndex(),966getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),967name2,968arrayCopyID,969unknownCopy ? node->getLocalIndex()+11 : node->getLocalIndex()+13,970arrayCopyID);971node->setLocalIndex(_gpuNodeCount++);972973if (!isBackwardsCopy)974{975//source address976ir.print(" %%%d = phi %s [ %s, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",977node->getLocalIndex(),978getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),979name0,980arrayCopyID,981unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,982arrayCopyID);983node->setLocalIndex(_gpuNodeCount++);984985//destination address986ir.print(" %%%d = phi %s [ %s, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",987node->getLocalIndex(),988getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),989name1,990arrayCopyID,991unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,992arrayCopyID);993node->setLocalIndex(_gpuNodeCount++);994}995else996{997//source address998ir.print(" %%%d = phi %s [ %%%d, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",999node->getLocalIndex(),1000getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),1001node->getLocalIndex()-3,1002arrayCopyID,1003unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,1004arrayCopyID);1005node->setLocalIndex(_gpuNodeCount++);10061007//destination address1008ir.print(" %%%d = phi %s [ %%%d, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",1009node->getLocalIndex(),1010getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),1011node->getLocalIndex()-3,1012arrayCopyID,1013unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,1014arrayCopyID);1015node->setLocalIndex(_gpuNodeCount++);1016}10171018//change pointer types from i8* if copying halfword or word data1019if (isWordCopy || isHalfwordCopy)1020{1021ir.print(" %%%d = bitcast %s %%%d to %s\n",1022node->getLocalIndex(),1023getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),1024node->getLocalIndex()-2,1025isWordCopy ? "i32*" : "i16*");1026node->setLocalIndex(_gpuNodeCount++);10271028ir.print(" %%%d = bitcast %s %%%d to %s\n",1029node->getLocalIndex(),1030getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),1031node->getLocalIndex()-2,1032isWordCopy ? "i32*" : "i16*");1033node->setLocalIndex(_gpuNodeCount++);1034}10351036//check if byte length is less than or equal to zero and skip the copy if true1037ir.print(" %%%d = icmp sle %s %%%d, 0\n",1038node->getLocalIndex(),1039getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),1040unknownCopy ? node->getLocalIndex()-3 : node->getLocalIndex()-5);1041node->setLocalIndex(_gpuNodeCount++);10421043ir.print(" br i1 %%%d, label %%AfterArrayCopy%d, label %%ArrayCopyBody%d\n",1044node->getLocalIndex()-1,1045arrayCopyID,1046arrayCopyID);10471048ir.print("ArrayCopyBody%d:\n", arrayCopyID);10491050//load data to copy1051ir.print(" %%%d = load %s %%%d\n",1052node->getLocalIndex(),1053isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),1054node->getLocalIndex()-3);1055node->setLocalIndex(_gpuNodeCount++);10561057//store loaded data1058ir.print(" store %s %%%d, %s %%%d\n",1059isWordCopy ? "i32" : isHalfwordCopy ? "i16" : "i8",1060node->getLocalIndex()-1,1061isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),1062node->getLocalIndex()-3);10631064ir.print(" %%%d = ptrtoint %s %%%d to i64\n",1065node->getLocalIndex(),1066isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),1067node->getLocalIndex()-4);1068node->setLocalIndex(_gpuNodeCount++);10691070ir.print(" %%%d = ptrtoint %s %%%d to i64\n",1071node->getLocalIndex(),1072isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),1073node->getLocalIndex()-4);1074node->setLocalIndex(_gpuNodeCount++);10751076//move source pointer1077ir.print(" %%%d = %s i64 %%%d, %d\n",1078node->getLocalIndex(),1079isBackwardsCopy ? "sub" : "add",1080node->getLocalIndex()-2,1081isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);1082node->setLocalIndex(_gpuNodeCount++);10831084//move destination pointer1085ir.print(" %%%d = %s i64 %%%d, %d\n",1086node->getLocalIndex(),1087isBackwardsCopy ? "sub" : "add",1088node->getLocalIndex()-2,1089isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);1090node->setLocalIndex(_gpuNodeCount++);10911092ir.print(" %%%d = inttoptr i64 %%%d to %s\n",1093node->getLocalIndex(),1094node->getLocalIndex()-2,1095getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()));1096node->setLocalIndex(_gpuNodeCount++);10971098ir.print(" %%%d = inttoptr i64 %%%d to %s\n",1099node->getLocalIndex(),1100node->getLocalIndex()-2,1101getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()));1102node->setLocalIndex(_gpuNodeCount++);11031104//decrement copy length1105ir.print(" %%%d = sub %s %%%d, %d\n",1106node->getLocalIndex(),1107getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),1108unknownCopy ? node->getLocalIndex()-11 : node->getLocalIndex()-13,1109isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);11101111ir.print(" br label %%ArrayCopyHeader%d\n", arrayCopyID);1112ir.print("AfterArrayCopy%d:\n", arrayCopyID);1113}11141115bool isThisPointer(TR::SymbolReference * symRef)1116{1117return symRef->getSymbol()->isParm() &&1118((TR::ParameterSymbol *)symRef->getSymbol())->getSlot() == 0;1119}11201121char * getTypeNameFromSignature(char* sig, int32_t sigLength)1122{1123TR_ASSERT(sigLength == 2 && sig[0] == '[', "only handling static shared arrays");1124switch (sig[1])1125{1126case 'Z': return "i8";1127case 'B': return "i8";1128case 'C': return "i16";1129case 'S': return "i16";1130case 'I': return "i32";1131case 'J': return "i64";1132case 'F': return "float";1133case 'D': return "double";1134}1135TR_ASSERT(false, "unsupported shared array type\n");1136return NULL;1137}11381139static bool isSharedMemory(TR::Node *node, TR_SharedMemoryAnnotations *sharedMemory, TR::Compilation *comp)1140{1141if (!comp->isGPUCompilation()) return false;11421143TR::SymbolReference *symRef = node->getSymbolReference();1144if (!symRef->getSymbol()->isAutoOrParm() && symRef->getCPIndex() != -1)1145{1146TR_SharedMemoryField field = sharedMemory->find(comp, symRef);1147if (field.getSize() > 0) return true;1148}1149return false;1150}115111521153TR::CodeGenerator::GPUResult1154J9::CodeGenerator::printNVVMIR(1155NVVMIRBuffer &ir,1156TR::Node * node,1157TR_RegionStructure *loop,1158TR_BitVector *targetBlocks,1159vcount_t visitCount,1160TR_SharedMemoryAnnotations *sharedMemory,1161int32_t &nextParmNum,1162TR::Node * &errorNode)1163{1164GPUResult result;11651166static bool enableExceptionChecks = (feGetEnv("TR_disableGPUExceptionCheck") == NULL);1167TR::ILOpCode opcode = node->getOpCode();11681169char name0[MAX_NAME], name1[MAX_NAME];1170bool isGenerated = false;1171bool printChildrenWithRefCount1 = true;11721173if (node->isProfilingCode())1174{1175// Nothing to generate for profiling code, but we still need to visit the children1176// We can skip over the children with a reference count of one since they aren't used anywhere else.1177isGenerated = true;1178printChildrenWithRefCount1 = false;1179}11801181if (node->getOpCodeValue() == TR::compressedRefs)1182{1183if (loop->isExprInvariant(node))1184return GPUSuccess; // ignore for now1185node = node->getFirstChild();1186}11871188if (node->getOpCodeValue() == TR::treetop)1189node = node->getFirstChild();11901191if (self()->comp()->isGPUCompilation() &&1192opcode.isLoadVarDirect() &&1193isThisPointer(node->getSymbolReference()))1194return GPUSuccess; // will handle in the parent11951196if (opcode.isLoadConst())1197{1198if((node->getDataType() == TR::Address) && (node->getAddress() != 0))1199{1200traceMsg(self()->comp(), "Load Const with a non-zero address in node %p\n", node);1201return GPUInvalidProgram;1202}1203else1204{1205return GPUSuccess; // will handle in the parent1206}1207}12081209if (node->getOpCodeValue() == TR::asynccheck)1210return GPUSuccess;12111212if (!enableExceptionChecks &&1213(opcode.isNullCheck() || opcode.isBndCheck() || node->getOpCodeValue() == TR::DIVCHK))1214return GPUSuccess;121512161217if (node->getVisitCount() == visitCount)1218return GPUSuccess;12191220node->setVisitCount(visitCount);12211222if (opcode.isNullCheck())1223{12241225TR::Node *refNode = node->getNullCheckReference();12261227if (isSharedMemory(refNode, sharedMemory, self()->comp()))1228{1229// Shared Memory is always allocated1230ir.print("; DELETE NULLCHK [%p] since this reference [%p] is allocated in shared memory\n",1231node, refNode);1232return GPUSuccess;1233}1234if (_gpuPostDominators && _gpuPostDominators->dominates(_gpuCurrentBlock, _gpuStartBlock))1235{1236TR::SymbolReference *symRef = refNode->getSymbolReference();1237if (symRef->getSymbol()->isParm())1238{1239int32_t argpos = symRef->getCPIndex();1240ir.print("; DELETE NULLCHK [%p] (ref[%p] is parm %d) since BB[%d] postdominates BB[%d]\n",1241node, refNode, argpos, _gpuCurrentBlock->getNumber(), _gpuStartBlock->getNumber());1242_gpuNeedNullCheckArguments_vector |= (1L << (uint64_t)argpos);1243return GPUSuccess;1244}1245}12461247result = self()->printNVVMIR(ir, refNode, loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);1248if (result != GPUSuccess) return result;12491250getNodeName(refNode, name0, self()->comp());1251const char *type0 = getTypeName(refNode->getDataType());12521253node->setLocalIndex(_gpuNodeCount++);12541255ir.print(" %%%d = icmp eq %s %s, null\n",1256node->getLocalIndex(), type0, name0, name1);1257ir.print(" br i1 %%%d, label %%NullException, label %%nullchk_fallthru_%d, !prof !0\n",1258node->getLocalIndex(), node->getLocalIndex());1259ir.print("nullchk_fallthru_%d:\n", node->getLocalIndex());12601261_gpuHasNullCheck = true;1262isGenerated = true;1263}1264else if (opcode.isBndCheck())1265{1266bool isSMReference = false;1267int32_t smsize = -1;1268if (node->getChild(0)->getOpCodeValue() == TR::arraylength)1269{1270TR::Node *refNode = node->getChild(0)->getChild(0);1271if (isSharedMemory(refNode, sharedMemory, self()->comp()))1272{1273TR_SharedMemoryField field = sharedMemory->find(self()->comp(), refNode->getSymbolReference());1274smsize = field.getSize();1275TR_ASSERT(smsize > 0, "should be annotated as shared array with positive size");12761277isSMReference = true;1278ir.print("; USE CONSTANT LENGTH %d for NULLCHK [%p] since this reference [%p] is allocated in shared memory\n",1279smsize, node, refNode);1280}1281}12821283if (!isSMReference)1284{1285result = self()->printNVVMIR(ir, node->getChild(0), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);1286if (result != GPUSuccess) return result;1287}1288result = self()->printNVVMIR(ir, node->getChild(1), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);1289if (result != GPUSuccess) return result;12901291if (!isSMReference)1292{1293getNodeName(node->getChild(0), name0, self()->comp());1294}1295else1296{1297TR::snprintfNoTrunc(name0, MAX_NAME, "%" OMR_PRId32, smsize);1298}12991300getNodeName(node->getChild(1), name1, self()->comp());1301const char *type0 = getTypeName(node->getChild(0)->getDataType());13021303node->setLocalIndex(_gpuNodeCount++);13041305ir.print(" %%%d = icmp ule %s %s, %s\n",1306node->getLocalIndex(), type0, name0, name1);1307ir.print(" br i1 %%%d, label %%BndException, label %%bndchk_fallthru_%d, !prof !0\n",1308node->getLocalIndex(), node->getLocalIndex());1309ir.print("bndchk_fallthru_%d:\n", node->getLocalIndex());13101311_gpuHasBndCheck = true;1312isGenerated = true;1313}1314else if (node->getOpCodeValue() == TR::DIVCHK)1315{1316TR::Node *idivNode = node->getChild(0);1317result = self()->printNVVMIR(ir, idivNode->getChild(0), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);1318if (result != GPUSuccess) return result;13191320result = self()->printNVVMIR(ir, idivNode->getChild(1), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);1321if (result != GPUSuccess) return result;13221323getNodeName(idivNode->getChild(1), name0, self()->comp());1324const char *type0 = getTypeName(idivNode->getChild(1)->getDataType());13251326node->setLocalIndex(_gpuNodeCount++);13271328ir.print(" %%%d = icmp eq %s %s, 0\n",1329node->getLocalIndex(), type0, name0);1330ir.print(" br i1 %%%d, label %%DivException, label %%divchk_fallthru_%d, !prof !0\n",1331node->getLocalIndex(), node->getLocalIndex());1332ir.print("divchk_fallthru_%d:\n", node->getLocalIndex());13331334_gpuHasDivCheck = true;1335isGenerated = true;1336}13371338// This symbol reference should become a parameter1339// children should be skipped (they are loop invariant)1340if (node->getOpCode().isLoadVar() &&1341_gpuSymbolMap[node->getSymbolReference()->getReferenceNumber()]._parmSlot != -1)1342{1343getParmName(_gpuSymbolMap[node->getSymbolReference()->getReferenceNumber()]._parmSlot, name0);13441345node->setLocalIndex(_gpuNodeCount++);1346traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());13471348ir.print(" %%%d = %s %s* %s, align %d\n",1349node->getLocalIndex(),1350getOpCodeName(node->getOpCodeValue()),1351getTypeName(node->getDataType()),1352name0,1353node->getSize());1354return GPUSuccess;1355}13561357//Don't run printNVVMIR on a children node if:1358//(they are the child of a profiling call) AND ((have a reference count less then two) OR (is a loadConst node))1359for (int32_t i = 0; i < node->getNumChildren(); ++i)1360{1361TR::Node *child = node->getChild(i);1362if ((child->getReferenceCount() >= 2 && !child->getOpCode().isLoadConst()) || printChildrenWithRefCount1)1363{1364result = self()->printNVVMIR(ir, child, loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);1365if (result != GPUSuccess)1366return result;1367}1368}13691370if (isGenerated)1371{1372return GPUSuccess;1373}13741375node->setLocalIndex(_gpuNodeCount++);1376traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());137713781379if (node->getOpCodeValue() == TR::PassThrough)1380{1381node->setLocalIndex(_gpuNodeCount--);1382return GPUSuccess;1383}1384else if (node->getOpCodeValue() == TR::BBStart)1385{1386node->setLocalIndex(_gpuNodeCount--);1387_gpuCurrentBlock = node->getBlock();1388if (targetBlocks->get(_gpuCurrentBlock->getNumber()))1389ir.print("block_%d:\n", _gpuCurrentBlock->getNumber());1390}1391// if block has a label previous block has to end with a branch1392else if (node->getOpCodeValue() == TR::BBEnd &&1393!_gpuCurrentBlock->endsInBranch() &&1394!_gpuCurrentBlock->getLastRealTreeTop()->getNode()->getOpCode().isReturn() &&1395!_gpuCurrentBlock->getLastRealTreeTop()->getNode()->getOpCode().isGoto() &&1396!_gpuCurrentBlock->getLastRealTreeTop()->getNode()->getOpCode().isSwitch() &&1397_gpuCurrentBlock->getNextBlock() &&1398targetBlocks->get(_gpuCurrentBlock->getNextBlock()->getNumber()))1399{1400node->setLocalIndex(_gpuNodeCount--);1401ir.print(" br label %%block_%d\n",1402_gpuCurrentBlock->getNextBlock()->getNumber());1403}1404else if (node->getOpCodeValue() == TR::BBEnd)1405{1406node->setLocalIndex(_gpuNodeCount--);1407}1408else if (node->getOpCode().isReturn())1409{1410node->setLocalIndex(_gpuNodeCount--);14111412if (node->getNumChildren() == 0)1413{1414ir.print(" %s void\n",1415getOpCodeName(node->getOpCodeValue()));1416}1417else1418{1419TR_ASSERT(node->getNumChildren() == 1, "Unsupported return\n");1420getNodeName(node->getChild(0), name0, self()->comp());1421ir.print(" %s %s %s\n",1422getOpCodeName(node->getOpCodeValue()),1423getTypeName(node->getDataType()),1424name0);1425}1426}1427else if (node->getOpCode().isStoreIndirect()) // TODO: handle statics1428{1429TR::Node *firstChild = node->getChild(0);1430TR::Node *secondChild = node->getChild(1);1431getNodeName(firstChild, name0, self()->comp());1432getNodeName(secondChild, name1, self()->comp());14331434{1435_gpuNodeCount--;1436ir.print(" %%%d = bitcast %s %s to %s %s*\n",1437_gpuNodeCount,1438getTypeName(firstChild->getDataType()), name0,1439getTypeName(secondChild->getDataType()),1440firstChild->chkSharedMemory() ? "addrspace(3)" : "");14411442ir.print(" %s %s %s, %s %s * %%%d, align %d\n",1443getOpCodeName(node->getOpCodeValue()),1444getTypeName(secondChild->getDataType()),1445name1,1446getTypeName(secondChild->getDataType()),1447firstChild->chkSharedMemory() ? "addrspace(3)" : "",1448_gpuNodeCount,1449secondChild->getSize());1450_gpuNodeCount++;1451}1452}1453else if (node->getOpCode().isLoadIndirect()) // TODO: handle statics1454{1455TR::Node *firstChild = node->getChild(0);14561457getNodeName(firstChild, name0, self()->comp());14581459if (node->getSymbolReference()->getCPIndex() != -1) // field of some object1460{1461// TODO: check that field is an array!1462TR_ASSERT(firstChild->getOpCode().isLoadDirect() &&1463isThisPointer(firstChild->getSymbolReference()),1464"can only access a field of this object\n");14651466// TODO: handle duplicate names from different classes1467TR_SharedMemoryField field = sharedMemory->find(self()->comp(), node->getSymbolReference());1468TR_ASSERT(field.getSize() >= 0, "field was not found in this object\n");14691470if (field.getSize() > 0)1471{1472ir.print(" %%%d = bitcast [%d x %s] addrspace(3)* @%.*s to i8*\n",1473node->getLocalIndex(),1474field.getSize(),1475getTypeNameFromSignature(field.getFieldSig(), field.getFieldSigLength()),1476field.getFieldNameLength(), field.getFieldName());14771478node->setSharedMemory(true);1479}1480else1481{1482int32_t parmNum = field.getParmNum();1483if (parmNum == -1)1484{1485sharedMemory->setParmNum(self()->comp(), node->getSymbolReference(), nextParmNum);1486parmNum = nextParmNum++;1487}1488ir.print(" %%%d = %s %s* %%p%d.addr, align %d\n",1489node->getLocalIndex(),1490getOpCodeName(node->getOpCodeValue()),1491getTypeName(node->getDataType()),1492parmNum,1493node->getSize());1494}1495}1496else1497{1498// assume SM35 or more1499static bool disableReadOnlyCacheArray = (feGetEnv("TR_disableGPUReadOnlyCacheArray") != NULL);1500bool isReadOnlyArray = false;1501if (node->getSymbolReference()->getSymbol()->isArrayShadowSymbol() &&1502// I disabled to generate ld.global.nc for read-only address array1503// I do not know an intrinsic function name for ld.global.nc of address1504node->getDataType() != TR::Address &&1505!disableReadOnlyCacheArray &&1506_gpuCanUseReadOnlyCache)1507{1508TR::Node *addrNode = node->getFirstChild();1509if (addrNode->getOpCodeValue() == TR::aiadd || addrNode->getOpCodeValue() == TR::aladd)1510{1511addrNode = addrNode->getFirstChild();1512}1513if ((addrNode->getOpCodeValue() == TR::aload) || (addrNode->getOpCodeValue() == TR::aloadi))1514{1515TR::SymbolReference *symRef = addrNode->getSymbolReference();1516int32_t symRefIndex = symRef->getReferenceNumber();1517CS2::ArrayOf<gpuMapElement, TR::Allocator> &gpuSymbolMap = self()->comp()->cg()->_gpuSymbolMap;15181519int32_t nc = symRefIndex;1520TR::SymbolReference *hostSymRef = gpuSymbolMap[nc]._hostSymRef;1521int32_t parmSlot = gpuSymbolMap[nc]._parmSlot;15221523if (!hostSymRef || parmSlot == -1)1524{1525TR::Node *tempNode = gpuSymbolMap[nc]._node;1526if (tempNode && (tempNode->getOpCodeValue() == TR::astore) && (tempNode->getFirstChild()->getOpCodeValue() == TR::aloadi))1527{1528TR::Node *parmNode = tempNode->getFirstChild();1529nc = parmNode->getSymbolReference()->getReferenceNumber();15301531hostSymRef = gpuSymbolMap[nc]._hostSymRef;1532parmSlot = gpuSymbolMap[nc]._parmSlot;1533}1534}1535else if (hostSymRef->getReferenceNumber() != symRefIndex)1536{1537hostSymRef = NULL;1538}15391540if (hostSymRef && (parmSlot != -1) &&1541(gpuSymbolMap[nc]._accessKind & TR::CodeGenerator::ReadWriteAccesses) == TR::CodeGenerator::ReadAccess)1542{1543isReadOnlyArray = true;1544}1545}1546}15471548ir.print(" %%%d = bitcast %s %s to %s %s*\n",1549_gpuNodeCount-1,1550getTypeName(firstChild->getDataType()),1551name0,1552getTypeName(node->getDataType()),1553firstChild->chkSharedMemory() ? "addrspace(3)" : isReadOnlyArray ? "addrspace(1)" : "");15541555node->setLocalIndex(_gpuNodeCount++);1556traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());15571558//NVVM 1.3 onward uses a two parameter version of ldg1559if (isReadOnlyArray)1560{1561if (_gpuUseOldLdgCalls)1562{1563ir.print(" %%%d = tail call %s @llvm.nvvm.ldg.global.%s.%s.p1%s(%s addrspace(1)* %%%d), !align !1%d\n",1564node->getLocalIndex(),1565getTypeName(node->getDataType()),1566(node->getDataType() >= TR::Float) ? "f" : "i",1567getVarTypeName(node->getDataType()),1568getVarTypeName(node->getDataType()),1569getTypeName(node->getDataType()),1570_gpuNodeCount-2,1571node->getSize());1572}1573else1574{1575ir.print(" %%%d = tail call %s @llvm.nvvm.ldg.global.%s.%s.p1%s(%s addrspace(1)* %%%d, i32 %d)\n",1576node->getLocalIndex(),1577getTypeName(node->getDataType()),1578(node->getDataType() >= TR::Float) ? "f" : "i",1579getVarTypeName(node->getDataType()),1580getVarTypeName(node->getDataType()),1581getTypeName(node->getDataType()),1582_gpuNodeCount-2,1583node->getSize());1584}1585}1586else1587// e.g. %32 = load i32 addrspace(4) * %31, align 41588ir.print(" %%%d = %s %s %s * %%%d, align %d\n",1589node->getLocalIndex(),1590getOpCodeName(node->getOpCodeValue()),1591getTypeName(node->getDataType()),1592firstChild->chkSharedMemory() ? "addrspace(3)" : "",1593_gpuNodeCount-2,1594node->getSize());1595}1596}1597else if (node->getOpCode().isCall() &&1598((TR::MethodSymbol*)node->getSymbolReference()->getSymbol())->getRecognizedMethod() != TR::unknownMethod &&1599self()->handleRecognizedMethod(node, ir, self()->comp()))1600{1601}1602else if (node->getOpCodeValue() == TR::arraycopy)1603{1604self()->printArrayCopyNVVMIR(node, ir, self()->comp());1605}1606else if (node->getOpCode().isCall())1607{1608traceMsg(self()->comp(), "unrecognized call %p\n", node);1609return GPUInvalidProgram;1610}1611else if (node->getOpCode().isStoreDirect() &&1612node->getSymbolReference()->getSymbol()->getRecognizedField() != TR::Symbol::UnknownField)1613{1614switch (node->getSymbolReference()->getSymbol()->getRecognizedField())1615{1616case TR::Symbol::Com_ibm_gpu_Kernel_syncThreads:1617ir.print(" call void @llvm.nvvm.barrier0()\n");1618break;1619default:1620break;1621}1622node->setLocalIndex(_gpuNodeCount--);1623}1624else if (node->getOpCode().isLoadVarDirect() &&1625node->getSymbolReference()->getSymbol()->getRecognizedField() != TR::Symbol::UnknownField &&1626self()->handleRecognizedField(node, ir))1627{1628}1629else if (node->getOpCode().isLoadVarDirect())1630{1631if (!node->getSymbol()->isAutoOrParm())1632{1633traceMsg(self()->comp(), "unexpected symbol in node %p\n");1634return GPUInvalidProgram;1635}16361637getAutoOrParmName(node->getSymbol(), name0);16381639ir.print(" %%%d = %s %s* %s, align %d\n",1640node->getLocalIndex(),1641getOpCodeName(node->getOpCodeValue()),1642getTypeName(node->getDataType()),1643name0,1644node->getSize());1645}1646else if (node->getOpCode().isStoreDirect())1647{1648if (!node->getSymbol()->isAutoOrParm())1649{1650traceMsg(self()->comp(), "unexpected symbol in node %p\n");1651return GPUInvalidProgram;1652}16531654getNodeName(node->getChild(0), name0, self()->comp());1655getAutoOrParmName(node->getSymbol(), name1);16561657ir.print(" %s %s %s, %s* %s, align %d\n",1658getOpCodeName(node->getOpCodeValue()),1659getTypeName(node->getChild(0)->getDataType()),1660name0,1661getTypeName(node->getDataType()),1662name1,1663node->getChild(0)->getSize());16641665node->setLocalIndex(_gpuNodeCount--);1666}1667else if (node->getOpCode().isArrayRef())1668{1669getNodeName(node->getChild(0), name0, self()->comp());1670getNodeName(node->getChild(1), name1, self()->comp());16711672ir.print(" %%%d = %s inbounds %s %s, %s %s\n",1673node->getLocalIndex(),1674getOpCodeName(node->getOpCodeValue()),1675getTypeName(node->getChild(0)->getDataType()),1676name0,1677getTypeName(node->getChild(1)->getDataType()), name1);16781679if (node->getChild(0)->chkSharedMemory())1680node->setSharedMemory(true);1681}1682else if (node->getOpCodeValue() == TR::arraylength)1683{1684// assume SM35 or more1685static bool disableReadOnlyCacheObjHdr = (feGetEnv("TR_disableGPUReadOnlyCacheObjHdr") != NULL);1686getNodeName(node->getChild(0), name0, self()->comp());16871688ir.print(" %%%d = getelementptr inbounds i8* %s, i32 %d\n",1689node->getLocalIndex(),1690name0,1691self()->objectLengthOffset());16921693node->setLocalIndex(_gpuNodeCount++);16941695ir.print(" %%%d = bitcast i8* %%%d to i32 %s*\n",1696node->getLocalIndex(),1697node->getLocalIndex() - 1,1698(_gpuCanUseReadOnlyCache && !disableReadOnlyCacheObjHdr) ? "addrspace(1)" : "");16991700node->setLocalIndex(_gpuNodeCount++);17011702//NVVM 1.3 onward uses a two parameter version of ldg1703if (_gpuCanUseReadOnlyCache && !disableReadOnlyCacheObjHdr)1704{1705if (_gpuUseOldLdgCalls)1706{1707ir.print(" %%%d = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%%d), !align !14\n",1708node->getLocalIndex(),1709node->getLocalIndex() - 1);1710}1711else1712{1713ir.print(" %%%d = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%%d, i32 4)\n",1714node->getLocalIndex(),1715node->getLocalIndex() - 1);1716}1717}1718else1719ir.print(" %%%d = load i32* %%%d, align 4\n",1720node->getLocalIndex(),1721node->getLocalIndex() - 1);1722}1723// Binary Operations1724else if ((node->getOpCodeValue() == TR::lshl ||1725node->getOpCodeValue() == TR::lshr) &&1726node->getChild(1)->getDataType() == TR::Int32)1727{1728getNodeName(node->getChild(0), name0, self()->comp());1729getNodeName(node->getChild(1), name1, self()->comp());17301731ir.print(" %%%d = sext i32 %s to i64\n",1732node->getLocalIndex(),1733name1);1734node->setLocalIndex(_gpuNodeCount++);17351736ir.print(" %%%d = %s %s %s, %%%d\n",1737node->getLocalIndex(),1738getOpCodeName(node->getOpCodeValue()),1739getTypeName(node->getDataType()),1740name0, _gpuNodeCount-2);17411742}1743else if (node->getOpCodeValue() == TR::imulh || node->getOpCodeValue() == TR::iumulh ||1744node->getOpCodeValue() == TR::lmulh || node->getOpCodeValue() == TR::lumulh)1745{1746getNodeName(node->getChild(0), name0, self()->comp());1747getNodeName(node->getChild(1), name1, self()->comp());17481749bool isLongMul = node->getOpCodeValue() == TR::lmulh || node->getOpCodeValue() == TR::lumulh;1750bool isSignedMul = node->getOpCodeValue() == TR::imulh || node->getOpCodeValue() == TR::lmulh;17511752bool extendChild0 = isLongMul || (node->getChild(0)->getDataType() != TR::Int64);1753bool extendChild1 = isLongMul || (node->getChild(1)->getDataType() != TR::Int64);17541755if (extendChild0)1756{1757ir.print(" %%%d = %s %s %s to %s\n",1758node->getLocalIndex(),1759isSignedMul ? "sext" : "zext",1760getTypeName(node->getChild(0)->getDataType()),1761name0,1762isLongMul ? "i128" : "i64");1763node->setLocalIndex(_gpuNodeCount++);1764}1765else1766{1767ir.print(" %%%d = lshr i64 %s, 0\n",1768node->getLocalIndex(),1769name0);1770node->setLocalIndex(_gpuNodeCount++);1771}17721773if(extendChild1)1774{1775ir.print(" %%%d = %s %s %s to %s\n",1776node->getLocalIndex(),1777isSignedMul ? "sext" : "zext",1778getTypeName(node->getChild(1)->getDataType()),1779name1,1780isLongMul ? "i128" : "i64");1781node->setLocalIndex(_gpuNodeCount++);1782}1783else1784{1785ir.print(" %%%d = lshr i64 %s, 0\n",1786node->getLocalIndex(),1787name1);1788node->setLocalIndex(_gpuNodeCount++);1789}17901791ir.print(" %%%d = mul %s %%%d, %%%d\n",1792node->getLocalIndex(),1793isLongMul ? "i128" : "i64",1794node->getLocalIndex()-2,1795node->getLocalIndex()-1);1796node->setLocalIndex(_gpuNodeCount++);17971798ir.print(" %%%d = lshr %s %%%d, %s\n",1799node->getLocalIndex(),1800isLongMul ? "i128" : "i64",1801node->getLocalIndex()-1,1802isLongMul ? "64" : "32");1803node->setLocalIndex(_gpuNodeCount++);18041805ir.print(" %%%d = trunc %s %%%d to %s\n",1806node->getLocalIndex(),1807isLongMul ? "i128" : "i64",1808node->getLocalIndex()-1,1809isLongMul ? "i64" : "i32");1810}1811else if (node->getOpCodeValue() == TR::bneg || node->getOpCodeValue() == TR::sneg ||1812node->getOpCodeValue() == TR::ineg || node->getOpCodeValue() == TR::lneg ||1813node->getOpCodeValue() == TR::fneg || node->getOpCodeValue() == TR::dneg)1814{1815getNodeName(node->getChild(0), name0, self()->comp());18161817bool isFloatDouble = node->getOpCodeValue() == TR::fneg || node->getOpCodeValue() == TR::dneg;18181819ir.print(" %%%d = %s %s %s, %s\n",1820node->getLocalIndex(),1821getOpCodeName(node->getOpCodeValue()),1822getTypeName(node->getDataType()),1823isFloatDouble ? "0.0" : "0",1824name0);1825}1826else if (node->getOpCodeValue() == TR::iabs || node->getOpCodeValue() == TR::labs)1827{1828getNodeName(node->getChild(0), name0, self()->comp());18291830bool isInt = node->getOpCodeValue() == TR::iabs;18311832ir.print(" %%%d = ashr %s %s, %s\n",1833node->getLocalIndex(),1834getTypeName(node->getDataType()),1835name0,1836isInt ? "31" : "63");1837node->setLocalIndex(_gpuNodeCount++);18381839ir.print(" %%%d = xor %s %s, %%%d\n",1840node->getLocalIndex(),1841getTypeName(node->getDataType()),1842name0,1843node->getLocalIndex()-1);1844node->setLocalIndex(_gpuNodeCount++);18451846ir.print(" %%%d = sub %s %%%d, %%%d\n",1847node->getLocalIndex(),1848getTypeName(node->getDataType()),1849node->getLocalIndex()-1,1850node->getLocalIndex()-2);1851}1852else if (node->getOpCodeValue() == TR::irol || node->getOpCodeValue() == TR::lrol)1853{1854getNodeName(node->getChild(0), name0, self()->comp());1855getNodeName(node->getChild(1), name1, self()->comp());18561857bool isInt = node->getOpCodeValue() == TR::irol;18581859ir.print(" %%%d = shl %s %s, %s\n",1860node->getLocalIndex(),1861getTypeName(node->getDataType()),1862name0, name1);1863node->setLocalIndex(_gpuNodeCount++);18641865ir.print(" %%%d = sub %s %s, %s\n",1866node->getLocalIndex(),1867getTypeName(node->getChild(1)->getDataType()),1868isInt ? "32" : "64",1869name1);1870node->setLocalIndex(_gpuNodeCount++);18711872ir.print(" %%%d = and %s %%%d, %s\n",1873node->getLocalIndex(),1874getTypeName(node->getChild(1)->getDataType()),1875node->getLocalIndex()-1,1876isInt ? "31" : "63");1877node->setLocalIndex(_gpuNodeCount++);18781879ir.print(" %%%d = lshr %s %s, %%%d\n",1880node->getLocalIndex(),1881getTypeName(node->getDataType()),1882name0,1883node->getLocalIndex()-1);1884node->setLocalIndex(_gpuNodeCount++);18851886ir.print(" %%%d = or %s %%%d, %%%d\n",1887node->getLocalIndex(),1888getTypeName(node->getDataType()),1889node->getLocalIndex()-4,1890node->getLocalIndex()-1);1891}1892else if (node->getOpCodeValue() == TR::ibits2f || node->getOpCodeValue() == TR::fbits2i ||1893node->getOpCodeValue() == TR::lbits2d || node->getOpCodeValue() == TR::dbits2l)1894{1895getNodeName(node->getChild(0), name0, self()->comp());18961897ir.print(" %%%d = %s %s %s to %s\n",1898node->getLocalIndex(),1899getOpCodeName(node->getOpCodeValue()),1900getTypeName(node->getChild(0)->getDataType()),1901name0,1902getTypeName(node->getDataType()));1903}1904else if (node->getOpCode().isArithmetic() &&1905node->getNumChildren() == 2 &&1906getOpCodeName(node->getOpCodeValue())) // supported binary opcode1907{1908getNodeName(node->getChild(0), name0, self()->comp());1909getNodeName(node->getChild(1), name1, self()->comp());19101911ir.print(" %%%d = %s %s %s, %s\n",1912node->getLocalIndex(),1913getOpCodeName(node->getOpCodeValue()),1914getTypeName(node->getDataType()),1915name0, name1);1916}1917else if (node->getOpCode().isConversion() &&1918getOpCodeName(node->getOpCodeValue()))1919{1920getNodeName(node->getChild(0), name0, self()->comp());19211922ir.print(" %%%d = %s %s %s to %s\n",1923node->getLocalIndex(),1924getOpCodeName(node->getOpCodeValue()),1925getTypeName(node->getChild(0)->getDataType()),1926name0,1927getTypeName(node->getDataType()));1928}1929else if (node->getOpCode().isIf())1930{1931getNodeName(node->getChild(0), name0, self()->comp());1932getNodeName(node->getChild(1), name1, self()->comp());1933const char *type0 = getTypeName(node->getChild(0)->getDataType());19341935const char *opcode = getOpCodeName(node->getOpCodeValue());19361937ir.print(" %%%d = %s %s %s, %s\n",1938node->getLocalIndex(), opcode, type0, name0, name1);1939ir.print(" br i1 %%%d, label %%block_%d, label %%block_%d\n",1940node->getLocalIndex(),1941node->getBranchDestination()->getNode()->getBlock()->getNumber(),1942_gpuCurrentBlock->getNextBlock()->getNumber());1943}1944else if (node->getOpCodeValue() == TR::Goto)1945{1946ir.print(" %s label %%block_%d\n",1947getOpCodeName(node->getOpCodeValue()),1948node->getBranchDestination()->getNode()->getBlock()->getNumber());1949node->setLocalIndex(_gpuNodeCount--);1950}1951else if (node->getOpCodeValue() == TR::lookup)1952{1953getNodeName(node->getChild(0), name0, self()->comp());19541955ir.print(" %s %s %s, label %%block_%d [ ",1956getOpCodeName(node->getOpCodeValue()),1957getTypeName(node->getChild(0)->getDataType()),1958name0,1959node->getChild(1)->getBranchDestination()->getNode()->getBlock()->getNumber()1960);1961for(int i=2; i < node->getNumChildren(); ++i)1962{1963ir.print("%s %d, label %%block_%d ",1964getTypeName(node->getChild(0)->getDataType()),1965node->getChild(i)->getCaseConstant(),1966node->getChild(i)->getBranchDestination()->getNode()->getBlock()->getNumber());1967}1968ir.print("]\n");1969node->setLocalIndex(_gpuNodeCount--);1970}1971else if (node->getOpCodeValue() == TR::table)1972{1973getNodeName(node->getChild(0), name0, self()->comp());19741975ir.print(" %s %s %s, label %%block_%d [ ",1976getOpCodeName(node->getOpCodeValue()),1977getTypeName(node->getChild(0)->getDataType()),1978name0,1979node->getChild(1)->getBranchDestination()->getNode()->getBlock()->getNumber()1980);1981for(int i=2; i < node->getNumChildren(); ++i)1982{1983ir.print("%s %d, label %%block_%d ",1984getTypeName(node->getChild(0)->getDataType()),1985i-2,1986node->getChild(i)->getBranchDestination()->getNode()->getBlock()->getNumber());1987}1988ir.print("]\n");1989node->setLocalIndex(_gpuNodeCount--);1990}1991else if (node->getOpCode().isBooleanCompare()) //Needs to be after "isIf()" check1992{1993getNodeName(node->getChild(0), name0, self()->comp());1994getNodeName(node->getChild(1), name1, self()->comp());1995const char *type0 = getTypeName(node->getChild(0)->getDataType());19961997const char *opcode = getOpCodeName(node->getOpCodeValue());19981999ir.print(" %%%d = %s %s %s, %s\n",2000node->getLocalIndex(), opcode, type0, name0, name1);2001node->setLocalIndex(_gpuNodeCount++);20022003ir.print(" %%%d = zext i1 %%%d to i32\n",2004node->getLocalIndex(),2005node->getLocalIndex()-1);2006}2007else if (node->getOpCodeValue() == TR::treetop || node->getOpCodeValue() == TR::Case)2008{2009node->setLocalIndex(_gpuNodeCount--);2010}2011else if (getOpCodeName(node->getOpCodeValue()) &&2012strcmp(getOpCodeName(node->getOpCodeValue()), "INVALID") == 0)2013{2014node->setLocalIndex(_gpuNodeCount--);2015traceMsg(self()->comp(), "INVALID operation required by node %p\n", node);2016return GPUInvalidProgram;2017}2018else2019{2020node->setLocalIndex(_gpuNodeCount--);2021traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());2022traceMsg(self()->comp(), "unsupported opcode (%s) on line %d %p\n", node->getOpCode().getName(), self()->comp()->getLineNumber(node), node);2023return GPUInvalidProgram;2024}20252026return GPUSuccess;2027}202820292030void traceNVVMIR(TR::Compilation *comp, char *buffer)2031{2032traceMsg(comp, "NVVM IR:\n");2033char msg[256];2034char *cs = buffer;2035int line = 1;2036while (*cs != '\0')2037{2038char *ce = cs;2039while (*ce != '\n' && *ce != '\0')2040{2041ce++;2042}2043ce++;2044int len = (ce - cs) < 255 ? (ce - cs) : 255;2045memcpy(msg, cs, len);2046msg[len] = '\0';2047traceMsg(comp, "%6d: %s", line++, msg);2048if (*(ce - 1) == '\0')2049{2050ce--;2051}2052cs = ce;2053}2054traceMsg(comp, "\n");2055}205620572058void2059J9::CodeGenerator::findExtraParms(2060TR::Node *node,2061int32_t &numExtraParms,2062TR_SharedMemoryAnnotations *sharedMemory,2063vcount_t visitCount)2064{2065if (node->getVisitCount() == visitCount)2066return;20672068node->setVisitCount(visitCount);20692070if (node->getOpCode().isLoadIndirect() &&2071_gpuSymbolMap[node->getSymbolReference()->getReferenceNumber()]._parmSlot == -1)2072{2073TR::Node *firstChild = node->getChild(0);2074if (node->getSymbolReference()->getCPIndex() != -1) // field of some object2075{2076// TODO: check that field is an array!2077TR_ASSERT(firstChild->getOpCode().isLoadDirect() &&2078isThisPointer(firstChild->getSymbolReference()),2079"can only access a field of this object\n");20802081// TODO: handle duplicate names from different classes2082TR_SharedMemoryField field = sharedMemory->find(TR::comp(), node->getSymbolReference());20832084if (field.getSize() == 0)2085numExtraParms++;2086}2087}20882089for (int32_t i = 0; i < node->getNumChildren(); ++i)2090{2091TR::Node *child = node->getChild(i);2092self()->findExtraParms(child, numExtraParms, sharedMemory, visitCount);2093}2094}209520962097void2098J9::CodeGenerator::dumpInvariant(2099CS2::ArrayOf<gpuParameter, TR::Allocator>::Cursor pit,2100NVVMIRBuffer &ir,2101bool isbufferalign)2102{2103return;21042105for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())2106{2107TR::Symbol *sym = pit->_hostSymRef->getSymbol();2108char parmName[MAX_NAME+2];2109getParmName(pit->_parmSlot, parmName, false);2110if (sym->getDataType() == TR::Address)2111{2112if (isbufferalign)2113strcat(parmName, ".t");2114ir.print(" call void @llvm.invariant.end({}* %%inv_%s_header, i64 %d, i8* %s)\n",2115&parmName[1], self()->objectHeaderInvariant(), parmName);2116}2117}2118}21192120#ifdef ENABLE_GPU2121bool calculateComputeCapability(int tracing, short* computeMajor, short* computeMinor, int deviceId);2122bool getNvvmVersion(int tracing, int* majorVersion, int* minorVersion);2123#endif21242125TR::CodeGenerator::GPUResult2126J9::CodeGenerator::dumpNVVMIR(2127TR::TreeTop *firstTreeTop,2128TR::TreeTop *lastTreeTop,2129TR_RegionStructure *loop,2130SharedSparseBitVector *blocksInLoop,2131ListBase<TR::AutomaticSymbol> *autos,2132ListBase<TR::ParameterSymbol> *parms,2133bool staticMethod,2134char * &nvvmIR,2135TR::Node * &errorNode,2136int gpuPtxCount,2137bool* hasExceptionChecks)2138{2139static bool isbufferalign = feGetEnv("TR_disableGPUBufferAlign") ? false : true;2140NVVMIRBuffer ir(self()->comp()->trMemory());2141GPUResult result;2142short computeMajor, computeMinor, computeCapability;2143int nvvmMajorVersion = 0;2144int nvvmMinorVersion = 0;21452146_gpuHasNullCheck = false;2147_gpuHasBndCheck = false;2148_gpuHasDivCheck = false;2149_gpuNodeCount = 0;2150_gpuReturnType = TR::NoType;2151_gpuPostDominators = NULL;2152_gpuStartBlock = NULL;2153_gpuNeedNullCheckArguments_vector = 0;2154_gpuCanUseReadOnlyCache = false;2155_gpuUseOldLdgCalls = false;21562157#ifdef ENABLE_GPU2158if (!calculateComputeCapability(/*tracing*/0, &computeMajor, &computeMinor, /*deviceId*/0))2159{2160traceMsg(self()->comp(), "calculateComputeCapability was unsuccessful.\n");2161return GPUHelperError;2162}2163computeCapability = 100*computeMajor + computeMinor; //combines Major and Minor versions into a single number.21642165if (computeCapability >= 305) //If compute capability is 3.5 or higher2166_gpuCanUseReadOnlyCache = true; //then the GPU is capable of using read only cache21672168if (!getNvvmVersion(/*tracing*/0, &nvvmMajorVersion, &nvvmMinorVersion))2169{2170traceMsg(self()->comp(), "getNvvmVersion was unsuccessful.\n");2171return GPUHelperError;2172}21732174/*2175* NVVM 1.3 updates LLVM support to LLVM 3.8. From LLVM 3.6 onward, ldg was changed to make alignment an explicit2176* parameter instead of as metadata. As a result, NVVM 1.2 and before uses a one parameter version of ldg while2177* NVVM 1.3 and onward uses a two parameter version.2178*/2179if (nvvmMajorVersion == 1 && nvvmMinorVersion <= 2)2180{2181_gpuUseOldLdgCalls = true;2182}2183#endif21842185TR::CFG *cfg = self()->comp()->getFlowGraph();2186TR_BitVector targetBlocks(cfg->getNumberOfNodes(), self()->comp()->trMemory(), stackAlloc, growable);21872188static bool enableExceptionCheckElimination = (feGetEnv("TR_enableGPUExceptionCheckElimination") != NULL);2189if (enableExceptionCheckElimination)2190{2191_gpuPostDominators = new (self()->comp()->trStackMemory()) TR_Dominators(self()->comp(), true);2192}2193_gpuStartBlock = toBlock(cfg->getStart());219421952196TR_SharedMemoryAnnotations sharedMemory(self()->comp());2197int32_t numExtraParms = 0;21982199// First pass through the trees2200vcount_t visitCount = self()->comp()->incVisitCount();22012202int32_t currentBlock = 0;2203int32_t firstBlock = 0;2204for (TR::TreeTop * tree = firstTreeTop; tree != lastTreeTop; tree = tree->getNextTreeTop())2205{2206if (tree->getNode()->getOpCodeValue() == TR::BBStart)2207currentBlock = tree->getNode()->getBlock()->getNumber();22082209if (blocksInLoop && !blocksInLoop->ValueAt(currentBlock))2210continue;22112212if (firstBlock == 0)2213firstBlock = currentBlock;22142215TR::Node *node = tree->getNode();2216if (node->getOpCode().isBranch())2217{2218TR_ASSERT(node->getBranchDestination()->getNode()->getOpCodeValue() == TR::BBStart, "Attempted to get Block number of a non-BBStart node.");2219targetBlocks.set(node->getBranchDestination()->getNode()->getBlock()->getNumber());22202221if (tree->getNextTreeTop() &&2222tree->getNextTreeTop()->getNextTreeTop())2223{2224node = tree->getNextTreeTop()->getNextTreeTop()->getNode();2225TR_ASSERT(node->getOpCodeValue() == TR::BBStart, "Attempted to get Block number of a non-BBStart node.");2226targetBlocks.set(node->getBlock()->getNumber());2227}2228}2229else if (node->getOpCode().isSwitch())2230{2231for (int childIndex = 0; childIndex < node->getNumChildren(); ++childIndex)2232{2233if (node->getChild(childIndex)->getOpCode().isBranch())2234{2235TR_ASSERT(node->getChild(childIndex)->getBranchDestination()->getNode()->getOpCodeValue() == TR::BBStart, "Attempted to get Block number of a non-BBStart node.");2236targetBlocks.set(node->getChild(childIndex)->getBranchDestination()->getNode()->getBlock()->getNumber());2237}2238}2239}2240else if (node->getOpCode().isReturn())2241_gpuReturnType = node->getDataType().getDataType();22422243//findExtraParms(node, numExtraParms, &sharedMemory, visitCount);2244}22452246traceMsg(self()->comp(), "extra parameters = %d\n", numExtraParms);2247ir.print("target triple = \"nvptx64-unknown-cuda\"\n");2248ir.print("target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"\n\n"); // TODO: 32-bit224922502251// TODO: alignment, arraylength !!!2252for(auto lit = sharedMemory.getSharedMemoryFields().begin(); lit != sharedMemory.getSharedMemoryFields().end(); ++lit)2253{2254if ((*lit).getSize() > 0)2255ir.print("@%.*s = internal addrspace(3) global [%d x %s] zeroinitializer, align 8\n",2256(*lit).getFieldNameLength(), (*lit).getFieldName(), (*lit).getSize(),2257getTypeNameFromSignature((*lit).getFieldSig(), (*lit).getFieldSigLength()));2258}22592260static bool disableReadOnlyCacheArray = (feGetEnv("TR_disableGPUReadOnlyCacheArray") != NULL);2261static bool disableReadOnlyCacheObjHdr = (feGetEnv("TR_disableGPUReadOnlyCacheObjHdr") != NULL);22622263//ir.print("@_ExceptionKind = addrspace(1) global [1 x i32 0, align 4\n");2264ir.print("@_ExceptionKind = addrspace(1) global [1 x i32] zeroinitializer, align 4\n");22652266//NVVM 1.3 onward uses a two parameter version of ldg2267if (_gpuCanUseReadOnlyCache && (!disableReadOnlyCacheArray || !disableReadOnlyCacheObjHdr))2268{2269if (_gpuUseOldLdgCalls)2270{2271ir.print("declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %%ptr)\n");2272ir.print("declare i16 @llvm.nvvm.ldg.global.i.i16.p1i16(i16 addrspace(1)* %%ptr)\n");2273ir.print("declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%ptr)\n");2274ir.print("declare i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* %%ptr)\n");2275ir.print("declare float @llvm.nvvm.ldg.global.f.f32.p1f32(float addrspace(1)* %%ptr)\n");2276ir.print("declare double @llvm.nvvm.ldg.global.f.f64.p1f64(double addrspace(1)* %%ptr)\n");2277ir.print("declare i8* @llvm.nvvm.ldg.global.p.p64.p1p64(i8* addrspace(1)* %%ptr)\n");2278}2279else2280{2281ir.print("declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %%ptr, i32 %%align)\n");2282ir.print("declare i16 @llvm.nvvm.ldg.global.i.i16.p1i16(i16 addrspace(1)* %%ptr, i32 %%align)\n");2283ir.print("declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%ptr, i32 %%align)\n");2284ir.print("declare i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* %%ptr, i32 %%align)\n");2285ir.print("declare float @llvm.nvvm.ldg.global.f.f32.p1f32(float addrspace(1)* %%ptr, i32 %%align)\n");2286ir.print("declare double @llvm.nvvm.ldg.global.f.f64.p1f64(double addrspace(1)* %%ptr, i32 %%align)\n");2287ir.print("declare i8* @llvm.nvvm.ldg.global.p.p64.p1p64(i8* addrspace(1)* %%ptr, i32 %%align)\n");2288}2289}22902291ir.print("declare {}* @llvm.invariant.start(i64 %%size, i8* nocapture %%ptr)\n");2292ir.print("declare void @llvm.invariant.end({}* %%start, i64 %%size, i8* nocapture %%ptr)\n");22932294ir.print("\ndefine %s @test%d(", getTypeName(_gpuReturnType), gpuPtxCount);22952296CS2::ArrayOf<gpuParameter, TR::Allocator> gpuParameterMap(TR::comp()->allocator());2297CS2::ArrayOf<TR::CodeGenerator::gpuMapElement, TR::Allocator>::Cursor ait(_gpuSymbolMap);22982299for (ait.SetToFirst(); ait.Valid(); ait.SetToNext())2300{2301if (!ait->_hostSymRef) continue;2302traceMsg(TR::comp(), "hostSymRef #%d parmSlot %d\n", (int)ait, ait->_parmSlot);23032304if (ait->_parmSlot != -1)2305{2306gpuParameter parm (ait->_hostSymRef, ait->_parmSlot);2307gpuParameterMap[ait->_parmSlot] = parm;2308}2309}231023112312TR::ResolvedMethodSymbol *method = self()->comp()->getJittedMethodSymbol();2313ListIterator<TR::ParameterSymbol> pi(parms);2314TR::ParameterSymbol *parm;23152316bool first = true;2317int32_t nextParmNum = staticMethod ? 0 : 1;23182319parm = pi.getFirst();2320if (!staticMethod) parm = pi.getNext();23212322char name[MAX_NAME];23232324for (; parm; parm = pi.getNext())2325{2326getAutoOrParmName(parm, name, false);23272328if (!first) ir.print(", ");2329ir.print("%s %s", getTypeName(parm->getDataType()), name);2330first = false;2331nextParmNum++;2332}233323342335CS2::ArrayOf<gpuParameter, TR::Allocator>::Cursor pit(gpuParameterMap);2336for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())2337{2338getParmName(pit->_parmSlot, name, false);23392340if (!first) ir.print(", ");2341ir.print("%s %s", getTypeName(pit->_hostSymRef->getSymbol()->getDataType()), name);2342first = false;2343nextParmNum++;2344}23452346int numParms = nextParmNum - (staticMethod ? 0 : 1);23472348ir.print("%s%s%s %%ExceptionKind",2349numParms > 0 ? ", " : "",2350self()->comp()->isGPUCompilation() ? "" : "i32 %startInclusive, i32 %endExclusive, ",2351getTypeName(TR::Address));23522353ir.print(") {\n");2354ir.print("entry:\n");23552356pi.reset();2357parm = pi.getFirst();2358if (!staticMethod) parm = pi.getNext();23592360first = true;2361for (; parm; parm = pi.getNext())2362{2363char name[MAX_NAME];2364getAutoOrParmName(parm, name);2365ir.print(" %s = alloca %s, align %d\n",2366name,2367getTypeName(parm->getDataType()),2368parm->getSize());23692370char origName[MAX_NAME];2371getAutoOrParmName(parm, origName, false);2372ir.print(" store %s %s, %s* %s, align %d\n",2373getTypeName(parm->getDataType()),2374origName,2375getTypeName(parm->getDataType()),2376name,2377parm->getSize());2378}237923802381for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())2382{2383TR::Symbol *sym = pit->_hostSymRef->getSymbol();2384char addrName[MAX_NAME+2];2385getParmName(pit->_parmSlot, addrName);2386ir.print(" %s = alloca %s, align %d\n",2387addrName,2388getTypeName(sym->getDataType()),2389sym->getSize());23902391char parmName[MAX_NAME];2392getParmName(pit->_parmSlot, parmName, false);2393if (sym->getDataType() == TR::Address)2394{2395if (isbufferalign)2396{2397char name[MAX_NAME];2398strcpy(name, parmName);2399strcat(parmName, ".t");2400ir.print(" %s = getelementptr inbounds i8* %s, i32 %d\n",2401parmName,2402name,2403GPUAlignment - TR::Compiler->om.contiguousArrayHeaderSizeInBytes());2404}2405ir.print(" %%inv_%s_header = call {}* @llvm.invariant.start(i64 %d, i8* %s)\n",2406&parmName[1], self()->objectHeaderInvariant(), parmName);2407}2408ir.print(" store %s %s, %s* %s, align %d\n",2409getTypeName(sym->getDataType()),2410parmName,2411getTypeName(sym->getDataType()),2412addrName,2413sym->getSize());24142415}241624172418ListIterator<TR::AutomaticSymbol> ai(autos);2419uint16_t liveLocalIndex = 0;2420for (TR::AutomaticSymbol *a = ai.getFirst(); a != NULL; a = ai.getNext())2421{2422ir.print(" %%a%d.addr = alloca %s, align %d\n",2423liveLocalIndex,2424getTypeName(a->getDataType()),2425a->getSize());2426a->setLiveLocalIndex(liveLocalIndex++, 0);2427}242824292430if (!self()->comp()->isGPUCompilation())2431{2432ir.print(" %%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n");2433ir.print(" %%1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n");2434ir.print(" %%2 = mul i32 %%0, %%1\n");2435ir.print(" %%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n");2436ir.print(" %%4 = add i32 %%2, %%3\n");2437ir.print(" %%5 = add i32 %%startInclusive, %%4\n");2438ir.print(" store i32 %%5, i32* %%%s.addr, align 4\n", "a0");2439ir.print(" %%6 = icmp slt i32 %%5, %%endExclusive\n");2440ir.print(" br i1 %%6, label %%block_%d, label %%block_0\n", firstBlock);2441ir.print("block_0:\n");2442self()->dumpInvariant(pit, ir, isbufferalign);2443ir.print(" ret void\n");24442445_gpuNodeCount = 7;2446}24472448// print all trees2449visitCount = self()->comp()->incVisitCount();24502451for (TR::TreeTop * tree = firstTreeTop; tree != lastTreeTop; tree = tree->getNextTreeTop())2452{2453TR::Node *node = tree->getNode();24542455if (node->getOpCodeValue() == TR::BBStart)2456currentBlock = node->getBlock()->getNumber();24572458if (blocksInLoop && !blocksInLoop->ValueAt(currentBlock))2459continue;24602461// don't print the backedge2462if (node->getOpCode().isBranch() &&2463node->getBranchDestination()->getNode()->getBlock()->getNumber() == firstBlock)2464{2465self()->dumpInvariant(pit, ir, isbufferalign);2466ir.print(" ret void\n");2467continue;2468}24692470result = self()->printNVVMIR(ir, tree->getNode(), loop, &targetBlocks, visitCount, &sharedMemory, nextParmNum, errorNode);2471if (result != GPUSuccess)2472{2473return result;2474}2475}24762477if (_gpuReturnType == TR::NoType)2478{2479self()->dumpInvariant(pit, ir, isbufferalign);2480ir.print(" ret void\n");2481}24822483_gpuNodeCount++;24842485if (_gpuNeedNullCheckArguments_vector != 0)2486{2487ir.print("; needNullCheckArguments_vector=");2488int32_t len = sizeof(uint64_t) * CHAR_BIT;2489for (int32_t i = len - 1; i >= 0; i--)2490{2491ir.print("%u", (_gpuNeedNullCheckArguments_vector >> (uint64_t)i) & 1);2492}2493ir.print("\n");2494}24952496if (_gpuHasNullCheck)2497{2498ir.print("NullException:\n");2499//ir.print(" store i32 1, i32 addrspace(1)* @_ExceptionKind, align 4\n");2500ir.print(" %%%d = bitcast i8* %%ExceptionKind to i32*\n", _gpuNodeCount);2501ir.print(" store i32 %d, i32 * %%%d, align 4\n", GPUNullCheck, _gpuNodeCount++);2502self()->dumpInvariant(pit, ir, isbufferalign);2503ir.print(" ret void\n");2504}2505if (_gpuHasBndCheck)2506{2507ir.print("BndException:\n");2508//ir.print(" store i32 2, i32 addrspace(1)* @_ExceptionKind, align 4\n");2509ir.print(" %%%d = bitcast i8* %%ExceptionKind to i32*\n", _gpuNodeCount);2510ir.print(" store i32 %d, i32 * %%%d, align 4\n", GPUBndCheck, _gpuNodeCount++);2511self()->dumpInvariant(pit, ir, isbufferalign);2512ir.print(" ret void\n");2513}2514if (_gpuHasDivCheck)2515{2516ir.print("DivException:\n");2517//ir.print(" store i32 3, i32 addrspace(1)* @_ExceptionKind, align 4\n");2518ir.print(" %%%d = bitcast i8* %%ExceptionKind to i32*\n", _gpuNodeCount);2519ir.print(" store i32 %d, i32 * %%%d, align 4\n", GPUDivException, _gpuNodeCount++);2520self()->dumpInvariant(pit, ir, isbufferalign);2521ir.print(" ret void\n");2522}25232524ir.print("}\n");25252526ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone\n");2527ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone\n");2528ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone\n");25292530ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone\n");2531ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() nounwind readnone\n");2532ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z() nounwind readnone\n");25332534ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone\n");2535ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() nounwind readnone\n");2536ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() nounwind readnone\n");25372538if (self()->comp()->getOptions()->getEnableGPU(TR_EnableGPUEnableMath))2539{2540ir.print("declare double @__nv_sin(double)\n");2541ir.print("declare double @__nv_cos(double)\n");2542ir.print("declare double @__nv_sqrt(double)\n");2543ir.print("declare double @__nv_log(double)\n");2544ir.print("declare double @__nv_exp(double)\n");2545ir.print("declare double @__nv_fabs(double)\n");2546ir.print("declare float @__nv_fabsf(float)\n");2547}25482549ir.print("declare void @llvm.nvvm.barrier0() nounwind readnone\n");25502551ir.print("!10 = metadata !{i32 0}\n");2552ir.print("!11 = metadata !{i32 1}\n");2553ir.print("!12 = metadata !{i32 2}\n");2554ir.print("!14 = metadata !{i32 4}\n");2555ir.print("!18 = metadata !{i32 8}\n");25562557ir.print("!nvvmir.version = !{!0}\n");2558ir.print("!0 = metadata !{i32 1, i32 0}\n");25592560ir.print("!nvvm.annotations = !{!1}\n");2561ir.print("!1 = metadata !{%s (", getTypeName(_gpuReturnType));2562pi.reset();2563parm = pi.getFirst();2564if (!staticMethod) parm = pi.getNext();25652566first = true;2567for (; parm; parm = pi.getNext())2568{2569if (!first) ir.print(", ");2570ir.print("%s", getTypeName(parm->getDataType()));2571first = false;2572}25732574for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())2575{2576TR::Symbol *sym = pit->_hostSymRef->getSymbol();2577if (!first) ir.print(", ");2578ir.print("%s", getTypeName(sym->getDataType()));2579first = false;2580}25812582ir.print("%s%s%s",2583numParms > 0 ? ", " : "",2584self()->comp()->isGPUCompilation() ? "" : "i32, i32, ", // startInclusive, endExclusive2585getTypeName(TR::Address)); // for ExceptionCheck25862587ir.print(")* @test%d, metadata !\"kernel\", i32 1}\n", gpuPtxCount);25882589nvvmIR = ir.getString();25902591traceNVVMIR(self()->comp(), nvvmIR);25922593//if any of these are set, it means this kernel may trigger a Java exception2594*hasExceptionChecks = (_gpuHasNullCheck || _gpuHasBndCheck || _gpuHasDivCheck);25952596return GPUSuccess;2597}259825992600void2601J9::CodeGenerator::generateGPU()2602{2603if (self()->comp()->isGPUCompilation())2604{2605char *programSource;2606TR::Node *errorNode;2607GPUResult result;2608TR::ResolvedMethodSymbol *method = self()->comp()->getJittedMethodSymbol();26092610{2611TR::StackMemoryRegion stackMemoryRegion(*self()->trMemory());26122613result = self()->dumpNVVMIR(self()->comp()->getStartTree(), self()->comp()->findLastTree(),2614NULL,2615NULL,2616&method->getAutomaticList(),2617&method->getParameterList(),2618false, // TODO: check if method is static2619programSource, errorNode, 0, 0); //gpuPtxCount is not applicable here so it is always set to 0.26202621} // scope of the stack memory region26222623self()->comp()->getOptimizationPlan()->setGPUResult(result);26242625if (result == GPUSuccess)2626{2627self()->comp()->getOptimizationPlan()->setGPUIR(programSource);2628}26292630if (!self()->comp()->isGPUCompileCPUCode())2631return;26322633TR::CFG *cfg = self()->comp()->getFlowGraph();2634TR::Block *startBlock = self()->comp()->getStartBlock();2635startBlock->split(self()->comp()->getStartTree()->getNextTreeTop(), cfg, false, false);26362637ListAppender<TR::ParameterSymbol> la(&method->getParameterList());2638la.empty(); // empty current parameter list26392640TR::ParameterSymbol *parmSymbol;2641int slot = 0;26422643parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);2644parmSymbol->setOrdinal(slot++);2645parmSymbol->setReferencedParameter();2646parmSymbol->setLinkageRegisterIndex(0);2647parmSymbol->setTypeSignature("", 0);2648la.add(parmSymbol);26492650parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);2651parmSymbol->setOrdinal(slot++);2652parmSymbol->setReferencedParameter();2653parmSymbol->setLinkageRegisterIndex(1);2654parmSymbol->setTypeSignature("", 0);2655la.add(parmSymbol);26562657parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);2658parmSymbol->setOrdinal(slot++);2659parmSymbol->setReferencedParameter();2660parmSymbol->setLinkageRegisterIndex(2);2661parmSymbol->setTypeSignature("", 0);2662la.add(parmSymbol);26632664parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);2665parmSymbol->setOrdinal(slot++);2666parmSymbol->setReferencedParameter();2667parmSymbol->setLinkageRegisterIndex(3);2668parmSymbol->setTypeSignature("", 0);2669la.add(parmSymbol);26702671parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2672parmSymbol->setOrdinal(slot++);2673parmSymbol->setLinkageRegisterIndex(4);2674parmSymbol->setTypeSignature("", 0);2675la.add(parmSymbol);26762677parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2678parmSymbol->setOrdinal(slot++);2679parmSymbol->setReferencedParameter();2680parmSymbol->setLinkageRegisterIndex(5);2681parmSymbol->setTypeSignature("", 0);2682la.add(parmSymbol);26832684parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2685parmSymbol->setOrdinal(slot++);2686parmSymbol->setReferencedParameter();2687parmSymbol->setLinkageRegisterIndex(6);2688parmSymbol->setTypeSignature("", 0);2689la.add(parmSymbol);26902691parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2692parmSymbol->setOrdinal(slot++);2693parmSymbol->setReferencedParameter();2694parmSymbol->setLinkageRegisterIndex(7);2695parmSymbol->setTypeSignature("", 0);2696la.add(parmSymbol);26972698parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2699parmSymbol->setOrdinal(slot++);2700parmSymbol->setReferencedParameter();2701parmSymbol->setTypeSignature("", 0);2702la.add(parmSymbol);27032704parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2705parmSymbol->setOrdinal(slot++);2706parmSymbol->setReferencedParameter();2707parmSymbol->setTypeSignature("", 0);2708la.add(parmSymbol);27092710parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2711parmSymbol->setOrdinal(slot++);2712parmSymbol->setReferencedParameter();2713parmSymbol->setTypeSignature("", 0);2714la.add(parmSymbol);271527162717parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);2718parmSymbol->setOrdinal(slot++);2719parmSymbol->setReferencedParameter();2720parmSymbol->setTypeSignature("", 0);2721la.add(parmSymbol);27222723parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);2724parmSymbol->setOrdinal(slot++);2725parmSymbol->setReferencedParameter();2726parmSymbol->setTypeSignature("", 0);2727la.add(parmSymbol);27282729TR::Node *callNode, *parm;2730TR::SymbolReference *parmSymRef;2731callNode = TR::Node::create(self()->comp()->getStartTree()->getNode(), TR::icall, 13);27322733parm = TR::Node::create(callNode, TR::aload, 0); // vmThread2734parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 0, TR::Address);2735parm->setSymbolReference(parmSymRef);2736callNode->setAndIncChild(0, parm);27372738parm = TR::Node::create(callNode, TR::aload, 0); // method2739parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 1, TR::Address);2740parm->setSymbolReference(parmSymRef);2741callNode->setAndIncChild(1, parm);27422743parm = TR::Node::create(callNode, TR::aload, 0); // programSource2744parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 2, TR::Address);2745parm->setSymbolReference(parmSymRef);2746callNode->setAndIncChild(2, parm);27472748parm = TR::Node::create(callNode, TR::aload, 0); // invokeObject2749parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 3, TR::Address);2750parm->setSymbolReference(parmSymRef);2751callNode->setAndIncChild(3, parm);27522753parm = TR::Node::create(callNode, TR::iload, 0); // deviceId2754parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 4, TR::Int32);2755parm->setSymbolReference(parmSymRef);2756callNode->setAndIncChild(4, parm);27572758parm = TR::Node::create(callNode, TR::iload, 0); // gridDimX2759parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 5, TR::Int32);2760parm->setSymbolReference(parmSymRef);2761callNode->setAndIncChild(5, parm);27622763parm = TR::Node::create(callNode, TR::iload, 0); // gridDimY2764parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 6, TR::Int32);2765parm->setSymbolReference(parmSymRef);2766callNode->setAndIncChild(6, parm);27672768parm = TR::Node::create(callNode, TR::iload, 0); // gridDimZ2769parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 7, TR::Int32);2770parm->setSymbolReference(parmSymRef);2771callNode->setAndIncChild(7, parm);27722773parm = TR::Node::create(callNode, TR::iload, 0); // blockDimX2774parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 8, TR::Int32);2775parm->setSymbolReference(parmSymRef);2776callNode->setAndIncChild(8, parm);27772778parm = TR::Node::create(callNode, TR::iload, 0); // blockDimY2779parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 9, TR::Int32);2780parm->setSymbolReference(parmSymRef);2781callNode->setAndIncChild(9, parm);27822783parm = TR::Node::create(callNode, TR::iload, 0); // blockDimZ2784parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 10, TR::Int32);2785parm->setSymbolReference(parmSymRef);2786callNode->setAndIncChild(10, parm);27872788parm = TR::Node::create(callNode, TR::iload, 0); // argCount2789parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 11, TR::Int32);2790parm->setSymbolReference(parmSymRef);2791callNode->setAndIncChild(11, parm);27922793parm = TR::Node::create(callNode, TR::aload, 0); // args2794parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 12, TR::Address);2795parm->setSymbolReference(parmSymRef);2796callNode->setAndIncChild(12, parm);27972798TR::SymbolReference *helper = self()->comp()->getSymRefTab()->findOrCreateRuntimeHelper(TR_callGPU);2799helper->getSymbol()->castToMethodSymbol()->setLinkage(TR_System);2800callNode->setSymbolReference(helper);2801TR::Node *treetop = TR::Node::create(callNode, TR::treetop, 1);2802treetop->setAndIncChild(0, callNode);2803TR::TreeTop *callTreeTop = TR::TreeTop::create(self()->comp(), treetop);2804self()->comp()->getStartTree()->insertAfter(callTreeTop);28052806TR::Node *returnNode = TR::Node::create(callNode, TR::ireturn, 1); // TODO: handle mismatching returns2807returnNode->setAndIncChild(0, callNode);2808TR::TreeTop *returnTreeTop = TR::TreeTop::create(self()->comp(), returnNode);2809callTreeTop->insertAfter(returnTreeTop);28102811}2812}28132814uintptr_t2815J9::CodeGenerator::objectLengthOffset()2816{2817return self()->fe()->getOffsetOfContiguousArraySizeField();2818}28192820uintptr_t2821J9::CodeGenerator::objectHeaderInvariant()2822{2823return self()->objectLengthOffset() + 4 /*length*/ ;2824}282528262827