Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openj9
Path: blob/master/runtime/compiler/codegen/CodeGenGPU.cpp
6000 views
1
/*******************************************************************************
2
* Copyright (c) 2000, 2021 IBM Corp. and others
3
*
4
* This program and the accompanying materials are made available under
5
* the terms of the Eclipse Public License 2.0 which accompanies this
6
* distribution and is available at https://www.eclipse.org/legal/epl-2.0/
7
* or the Apache License, Version 2.0 which accompanies this distribution and
8
* is available at https://www.apache.org/licenses/LICENSE-2.0.
9
*
10
* This Source Code may also be made available under the following
11
* Secondary Licenses when the conditions for such availability set
12
* forth in the Eclipse Public License, v. 2.0 are satisfied: GNU
13
* General Public License, version 2 with the GNU Classpath
14
* Exception [1] and GNU General Public License, version 2 with the
15
* OpenJDK Assembly Exception [2].
16
*
17
* [1] https://www.gnu.org/software/classpath/license.html
18
* [2] http://openjdk.java.net/legal/assembly-exception.html
19
*
20
* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception
21
*******************************************************************************/
22
23
/**
24
* Support code for TR::CodeGenerator. Code related to generating GPU
25
*/
26
27
#include "codegen/CodeGenerator.hpp"
28
#include "codegen/CodeGenerator_inlines.hpp"
29
#include "codegen/RecognizedMethods.hpp"
30
#include "il/AutomaticSymbol.hpp"
31
#include "il/Block.hpp"
32
#include "il/Node.hpp"
33
#include "il/Node_inlines.hpp"
34
#include "il/ParameterSymbol.hpp"
35
#include "il/TreeTop.hpp"
36
#include "il/TreeTop_inlines.hpp"
37
#include "infra/String.hpp"
38
#include "env/CompilerEnv.hpp"
39
#include "env/StackMemoryRegion.hpp"
40
#include "env/annotations/GPUAnnotation.hpp"
41
#include "optimizer/Dominators.hpp"
42
#include "optimizer/Structure.hpp"
43
#include "omrformatconsts.h"
44
45
#define OPT_DETAILS "O^O CODE GENERATION: "
46
47
/**
 * Map a Testarossa IL opcode to the corresponding NVVM/LLVM IR instruction
 * mnemonic.
 *
 * Families of related opcodes (all integer widths, direct and indirect forms,
 * and the compare vs. compare-and-branch variants) collapse onto a single IR
 * mnemonic; the operand types are emitted separately by the caller.
 *
 * @param opcode IL opcode to translate (asserted to be a valid opcode).
 * @return the IR mnemonic; "" for block markers (BBStart/BBEnd), "INVALID"
 *         for newarray, and NULL for any opcode with no NVVM equivalent
 *         (callers use NULL to reject the method for GPU codegen).
 */
static const char* getOpCodeName(TR::ILOpCodes opcode) {

   TR_ASSERT(opcode < TR::NumIlOps, "Wrong opcode");

   switch(opcode)
      {
      // Direct and indirect loads of every width/type all lower to "load".
      case TR::iload: case TR::fload: case TR::dload: case TR::aload:
      case TR::bload: case TR::sload: case TR::lload:
      case TR::iloadi: case TR::floadi: case TR::dloadi: case TR::aloadi:
      case TR::bloadi: case TR::sloadi: case TR::lloadi:
         return "load";

      // Direct and indirect stores all lower to "store".
      case TR::istore: case TR::lstore: case TR::fstore: case TR::dstore:
      case TR::astore: case TR::bstore: case TR::sstore:
      case TR::lstorei: case TR::fstorei: case TR::dstorei: case TR::astorei:
      case TR::bstorei: case TR::sstorei: case TR::istorei:
         return "store";

      case TR::Goto:
         return "br";

      case TR::ireturn: case TR::lreturn: case TR::freturn:
      case TR::dreturn: case TR::areturn: case TR::Return:
         return "ret";

      // Integer vs. floating-point arithmetic use distinct IR mnemonics.
      case TR::iadd: case TR::ladd: case TR::badd: case TR::sadd:
         return "add";

      case TR::fadd: case TR::dadd:
         return "fadd";

      // Negation is folded into subtraction (caller supplies a zero operand).
      case TR::isub: case TR::lsub: case TR::bsub: case TR::ssub:
      case TR::ineg: case TR::lneg: case TR::bneg: case TR::sneg:
         return "sub";

      case TR::dsub: case TR::fsub: case TR::fneg: case TR::dneg:
         return "fsub";

      case TR::imul: case TR::lmul: case TR::bmul: case TR::smul:
         return "mul";

      case TR::fmul: case TR::dmul:
         return "fmul";

      case TR::idiv: case TR::ldiv: case TR::bdiv: case TR::sdiv:
         return "sdiv";

      case TR::fdiv: case TR::ddiv:
         return "fdiv";

      case TR::iudiv: case TR::ludiv:
         return "udiv";

      case TR::irem: case TR::lrem: case TR::brem: case TR::srem:
         return "srem";

      case TR::frem: case TR::drem:
         return "frem";

      case TR::iurem:
         return "urem";

      case TR::ishl: case TR::lshl: case TR::bshl: case TR::sshl:
         return "shl";

      // Signed shift right is arithmetic; unsigned is logical.
      case TR::ishr: case TR::lshr: case TR::bshr: case TR::sshr:
         return "ashr";

      case TR::iushr: case TR::lushr: case TR::bushr: case TR::sushr:
         return "lshr";

      case TR::iand: case TR::land: case TR::band: case TR::sand:
         return "and";

      case TR::ior: case TR::lor: case TR::bor: case TR::sor:
         return "or";

      case TR::ixor: case TR::lxor: case TR::bxor: case TR::sxor:
         return "xor";

      // Widening signed integer conversions.
      case TR::i2l: case TR::b2i: case TR::b2l: case TR::b2s:
      case TR::s2i: case TR::s2l:
         return "sext";

      // Signed integer to floating point.
      case TR::i2f: case TR::i2d: case TR::l2f: case TR::l2d:
      case TR::b2f: case TR::b2d: case TR::s2f: case TR::s2d:
         return "sitofp";

      // Narrowing integer conversions.
      case TR::i2b: case TR::i2s: case TR::l2i: case TR::l2b:
      case TR::l2s: case TR::s2b:
         return "trunc";

      // Integer to address conversions.
      case TR::l2a: case TR::i2a: case TR::s2a: case TR::b2a:
      case TR::lu2a: case TR::iu2a: case TR::su2a: case TR::bu2a:
         return "inttoptr";

      // Widening unsigned integer conversions.
      case TR::iu2l: case TR::bu2i: case TR::bu2l: case TR::bu2s:
      case TR::su2i: case TR::su2l:
         return "zext";

      // Unsigned integer to floating point.
      case TR::iu2f: case TR::iu2d: case TR::lu2f: case TR::lu2d:
      case TR::bu2f: case TR::bu2d: case TR::su2f: case TR::su2d:
         return "uitofp";

      // Floating point to signed integer.
      case TR::f2i: case TR::f2l: case TR::f2b: case TR::f2s:
      case TR::d2i: case TR::d2l: case TR::d2b: case TR::d2s:
         return "fptosi";

      case TR::f2d:
         return "fpext";

      case TR::d2f:
         return "fptrunc";

      // Address to integer conversions.
      case TR::a2i: case TR::a2l: case TR::a2b: case TR::a2s:
         return "ptrtoint";

      // Integer comparisons: the compare and compare-and-branch forms share
      // one mnemonic; the caller emits the branch when needed.
      case TR::icmpeq: case TR::lcmpeq: case TR::acmpeq: case TR::bcmpeq:
      case TR::scmpeq: case TR::ificmpeq: case TR::iflcmpeq: case TR::ifacmpeq:
      case TR::ifbcmpeq: case TR::ifscmpeq:
         return "icmp eq";

      case TR::icmpne: case TR::lcmpne: case TR::acmpne: case TR::bcmpne:
      case TR::scmpne: case TR::ificmpne: case TR::iflcmpne: case TR::ifacmpne:
      case TR::ifbcmpne: case TR::ifscmpne:
         return "icmp ne";

      case TR::icmplt: case TR::lcmplt: case TR::bcmplt: case TR::scmplt:
      case TR::ificmplt: case TR::iflcmplt: case TR::ifbcmplt: case TR::ifscmplt:
         return "icmp slt";

      case TR::icmpge: case TR::lcmpge: case TR::bcmpge: case TR::scmpge:
      case TR::ificmpge: case TR::iflcmpge: case TR::ifbcmpge: case TR::ifscmpge:
         return "icmp sge";

      case TR::icmpgt: case TR::lcmpgt: case TR::bcmpgt: case TR::scmpgt:
      case TR::ificmpgt: case TR::iflcmpgt: case TR::ifbcmpgt: case TR::ifscmpgt:
         return "icmp sgt";

      case TR::icmple: case TR::lcmple: case TR::bcmple: case TR::scmple:
      case TR::ificmple: case TR::iflcmple: case TR::ifbcmple: case TR::ifscmple:
         return "icmp sle";

      // Unsigned (and address) comparisons.
      case TR::acmplt: case TR::iucmplt: case TR::lucmplt: case TR::bucmplt:
      case TR::sucmplt: case TR::ifacmplt: case TR::ifiucmplt: case TR::iflucmplt:
      case TR::ifbucmplt: case TR::ifsucmplt:
         return "icmp ult";

      case TR::acmpge: case TR::iucmpge: case TR::bucmpge: case TR::lucmpge:
      case TR::sucmpge: case TR::ifacmpge: case TR::ifiucmpge: case TR::iflucmpge:
      case TR::ifbucmpge: case TR::ifsucmpge:
         return "icmp uge";

      case TR::acmpgt: case TR::iucmpgt: case TR::lucmpgt: case TR::bucmpgt:
      case TR::sucmpgt: case TR::ifacmpgt: case TR::ifiucmpgt: case TR::iflucmpgt:
      case TR::ifbucmpgt: case TR::ifsucmpgt:
         return "icmp ugt";

      case TR::acmple: case TR::iucmple: case TR::lucmple: case TR::bucmple:
      case TR::sucmple: case TR::ifacmple: case TR::ifiucmple: case TR::iflucmple:
      case TR::ifbucmple: case TR::ifsucmple:
         return "icmp ule";

      // Float comparisons: plain forms are "ordered" (false if either operand
      // is NaN), the *u forms are "unordered" (true if either operand is NaN).
      case TR::fcmpeq: case TR::dcmpeq: case TR::iffcmpeq: case TR::ifdcmpeq:
         return "fcmp oeq";

      case TR::fcmpne: case TR::dcmpne: case TR::iffcmpne: case TR::ifdcmpne:
         return "fcmp one";

      case TR::fcmplt: case TR::dcmplt: case TR::iffcmplt: case TR::ifdcmplt:
         return "fcmp olt";

      case TR::fcmpge: case TR::dcmpge: case TR::iffcmpge: case TR::ifdcmpge:
         return "fcmp oge";

      case TR::fcmpgt: case TR::dcmpgt: case TR::iffcmpgt: case TR::ifdcmpgt:
         return "fcmp ogt";

      case TR::fcmple: case TR::dcmple: case TR::iffcmple: case TR::ifdcmple:
         return "fcmp ole";

      case TR::fcmpequ: case TR::dcmpequ: case TR::iffcmpequ: case TR::ifdcmpequ:
         return "fcmp ueq";

      case TR::fcmpneu: case TR::dcmpneu: case TR::iffcmpneu: case TR::ifdcmpneu:
         return "fcmp une";

      case TR::fcmpltu: case TR::dcmpltu: case TR::iffcmpltu: case TR::ifdcmpltu:
         return "fcmp ult";

      case TR::fcmpgeu: case TR::dcmpgeu: case TR::iffcmpgeu: case TR::ifdcmpgeu:
         return "fcmp uge";

      case TR::fcmpgtu: case TR::dcmpgtu: case TR::iffcmpgtu: case TR::ifdcmpgtu:
         return "fcmp ugt";

      case TR::fcmpleu: case TR::dcmpleu: case TR::iffcmpleu: case TR::ifdcmpleu:
         return "fcmp ule";

      // Floating point to unsigned integer.
      case TR::d2c: case TR::f2c: case TR::f2bu: case TR::f2iu:
      case TR::f2lu: case TR::d2iu: case TR::d2lu: case TR::d2bu:
         return "fptoui";

      // Address arithmetic (internal pointer + offset).
      case TR::aiadd: case TR::aladd:
         return "getelementptr";

      // Bit-preserving int<->float reinterpretations.
      case TR::ibits2f: case TR::fbits2i: case TR::lbits2d: case TR::dbits2l:
         return "bitcast";

      case TR::lookup: case TR::table:
         return "switch";

      // Block boundary markers produce no IR instruction.
      case TR::BBStart:
      case TR::BBEnd:
         return "";

      case TR::newarray:
         return "INVALID";

      default:
         return NULL;
      }

   }
491
492
493
// NVVM IR value-type names, indexed by TR::DataType (TR::NoType .. TR::Address).
// Declared const: the entries point at string literals, which must never be
// written through (string-literal to non-const char* conversion is also
// ill-formed in modern C++).
const char *nvvmTypeNames[TR::NumTypes] =
   {
   "void",    // TR::NoType
   "i8",      // TR::Int8
   "i16",     // TR::Int16
   "i32",     // TR::Int32
   "i64",     // TR::Int64
   "float",   // TR::Float
   "double",  // TR::Double
   "i8*"      // TR::Address
   };
504
505
/**
 * Return the NVVM IR type name for @p type.
 * Asserts and returns "???" for any type outside TR::NoType..TR::Address.
 */
static const char* getTypeName(TR::DataType type) {
   // Guard clause: reject anything outside the table's index range.
   if (type < TR::NoType || type > TR::Address)
      {
      TR_ASSERT(false, "Unsupported type");
      return "???";
      }
   return nvvmTypeNames[type];
}
516
517
// Short NVVM type tags used when composing variable names, indexed by
// TR::DataType (TR::NoType .. TR::Address).  Declared const: the entries
// point at string literals, which must never be written through
// (string-literal to non-const char* conversion is also ill-formed in
// modern C++).
const char *nvvmVarTypeNames[TR::NumTypes] =
   {
   "void",   // TR::NoType
   "i8",     // TR::Int8
   "i16",    // TR::Int16
   "i32",    // TR::Int32
   "i64",    // TR::Int64
   "f32",    // TR::Float
   "f64",    // TR::Double
   "p64"     // TR::Address
   };
528
529
/**
 * Return the short NVVM type tag (used in variable names) for @p type.
 * Asserts and returns "???" for any type outside TR::NoType..TR::Address.
 */
static const char* getVarTypeName(TR::DataType type) {
   // Guard clause: reject anything outside the table's index range.
   if (type < TR::NoType || type > TR::Address)
      {
      TR_ASSERT(false, "Unsupported type");
      return "???";
      }
   return nvvmVarTypeNames[type];
}
540
541
#define MAX_NAME 256
542
543
544
// Write the NVVM name of the parameter in `slot` into `s`:
// "%p<slot>.addr" by default, or "%p<slot>" when addr is false.
static void getParmName(int32_t slot, char * s, bool addr = true)
   {
   const char *suffix = addr ? ".addr" : "";
   TR::snprintfNoTrunc(s, MAX_NAME, "%%p%" OMR_PRId32 "%s", slot, suffix);
   }
548
549
550
// Write the NVVM name of an auto or parameter symbol into `s`.
// Parms are named "%p<slot>", autos "%a<liveLocalIndex>"; the ".addr"
// suffix (the default) names the stack slot rather than the value.
static void getAutoOrParmName(TR::Symbol *sym, char * s, bool addr = true)
   {
   TR_ASSERT(sym->isAutoOrParm(), "expecting auto or parm");

   const char *suffix = addr ? ".addr" : "";
   if (sym->isParm())
      TR::snprintfNoTrunc(s, MAX_NAME, "%%p%" OMR_PRId32 "%s", sym->castToParmSymbol()->getSlot(), suffix);
   else
      TR::snprintfNoTrunc(s, MAX_NAME, "%%a%" OMR_PRId32 "%s", sym->castToAutoSymbol()->getLiveLocalIndex(), suffix);
   }
559
560
561
#define INIT_BUFFER_SIZE 65535
562
563
class NVVMIRBuffer
564
{
565
public:
566
NVVMIRBuffer(TR_Memory* mem)
567
{
568
m = mem;
569
size = INIT_BUFFER_SIZE;
570
buffer = (char*)m->allocateHeapMemory(size);
571
s = buffer;
572
}
573
void print(char *format, ...)
574
{
575
va_list args;
576
va_start (args, format);
577
int32_t left = size - (s - buffer);
578
579
va_list args_copy;
580
va_copy(args_copy, args);
581
int32_t len = vsnprintf(s, left, format, args_copy);
582
va_copy_end(args_copy);
583
584
if ((len + 1) > left)
585
{
586
expand(len + 1 - left);
587
left = size - (s - buffer);
588
len = vsnprintf(s, left, format, args);
589
}
590
591
s += len;
592
va_end(args);
593
}
594
595
char * getString() { return buffer; }
596
597
private:
598
599
void expand(int32_t min)
600
{
601
size += (min >= size) ? size*2 : size;
602
603
char * newBuffer = (char*)m->allocateHeapMemory(size);
604
memcpy(newBuffer, buffer, s - buffer);
605
s = newBuffer + (s - buffer);
606
buffer = newBuffer;
607
}
608
609
char *buffer;
610
char *s;
611
int32_t size;
612
TR_Memory* m;
613
};
614
615
616
/**
 * Write the NVVM IR operand spelling for `node` into `s` (capacity MAX_NAME).
 *
 * Load-const nodes are rendered as immediate constants; any other node is
 * rendered as the SSA temporary "%<localIndex>" that was assigned when the
 * node was evaluated.  (`comp` is accepted but currently unused.)
 */
static void getNodeName(TR::Node* node, char * s, TR::Compilation *comp)
   {
   if (node->getOpCode().isLoadConst())
      {
      bool isUnsigned = node->getOpCode().isUnsigned();
      switch (node->getDataType())
         {
         case TR::Int8:
            if(isUnsigned)
               TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu8, node->getUnsignedByte());
            else
               TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRId8, node->getByte());
            break;
         case TR::Int16:
            // Note: Int16 constants are printed unsigned regardless of
            // isUnsigned (unlike the other integer widths).
            TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu16, node->getConst<uint16_t>());
            break;
         case TR::Int32:
            if(isUnsigned)
               TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu32, node->getUnsignedInt());
            else
               TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRId32, node->getInt());
            break;
         case TR::Int64:
            if(isUnsigned)
               TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRIu64, node->getUnsignedLongInt());
            else
               TR::snprintfNoTrunc(s, MAX_NAME, "%" OMR_PRId64, node->getLongInt());
            break;
         case TR::Float:
            // LLVM-style IR spells float constants as the 64-bit hex pattern
            // of the value widened to double; the anonymous union reinterprets
            // the widened value's bits as an integer for printing.
            union
               {
               double doubleValue;
               int64_t doubleBits;
               };
            doubleValue = node->getFloat();
            TR::snprintfNoTrunc(s, MAX_NAME, "0x%016" OMR_PRIx64, doubleBits);
            break;
         case TR::Double:
            TR::snprintfNoTrunc(s, MAX_NAME, "0x%016" OMR_PRIx64, node->getDoubleBits());
            break;
         case TR::Address:
            // Only the null constant is representable; any other address
            // constant has no meaning on the device.
            if (node->getAddress() == 0)
               TR::snprintfNoTrunc(s, MAX_NAME, "null");
            else
               TR_ASSERT(0, "Non-null Address constants should not occur.\n");
            break;
         default:
            TR_ASSERT(0, "Unknown/unimplemented data type\n");
         }
      }
   else
      {
      // Non-constant: refer to the SSA temp assigned to this node.
      TR::snprintfNoTrunc(s, MAX_NAME, "%%%" OMR_PRIu32, node->getLocalIndex());
      }
   }
671
672
/**
 * Map a recognized java.lang.Math / java.lang.StrictMath call to the name of
 * the corresponding NVIDIA libdevice function (emitted as "@__nv_<name>").
 *
 * @param node a call node whose method symbol is expected to be one of the
 *             recognized math methods.
 * @return the libdevice function name, or "ERROR" for any other method
 *         (callers are expected to have filtered those out already).
 */
char* getNVVMMathFunctionName(TR::Node *node)
   {
   switch (((TR::MethodSymbol*)node->getSymbolReference()->getSymbol())->getRecognizedMethod())
      {
      case TR::java_lang_Math_sqrt:
         return "sqrt";
      case TR::java_lang_Math_sin:
      case TR::java_lang_StrictMath_sin:
         return "sin";
      case TR::java_lang_Math_cos:
      case TR::java_lang_StrictMath_cos:
         return "cos";
      case TR::java_lang_Math_log:
      case TR::java_lang_StrictMath_log:
         return "log";
      case TR::java_lang_Math_exp:
      case TR::java_lang_StrictMath_exp:
         return "exp";
      case TR::java_lang_Math_abs_F:
         return "fabsf";
      case TR::java_lang_Math_abs_D:
         return "fabs";
      default:
         return "ERROR";
      }
   // Unreachable: every switch path returns (dead trailing return removed).
   }
699
700
/**
 * Emit inline NVVM IR for calls to recognized GPU intrinsics
 * (com.ibm.gpu.Kernel block/thread geometry accessors, syncThreads) and,
 * when TR_EnableGPUEnableMath is set, recognized Math/StrictMath functions
 * (lowered to libdevice @__nv_* calls).
 *
 * @return true if IR was emitted for the call; false if the method is not
 *         recognized here (or GPU math is disabled), so the caller must
 *         handle the call some other way.
 */
bool J9::CodeGenerator::handleRecognizedMethod(TR::Node *node, NVVMIRBuffer &ir, TR::Compilation *comp)
   {
   char name0[MAX_NAME];
   switch (((TR::MethodSymbol*)node->getSymbolReference()->getSymbol())->getRecognizedMethod())
      {
      // Grid/block geometry reads map 1:1 onto NVVM special-register
      // intrinsics: ctaid = block index, ntid = block dim, tid = thread index.
      case TR::com_ibm_gpu_Kernel_blockIdxX:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_blockIdxY:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_blockIdxZ:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_blockDimX:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_blockDimY:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_blockDimZ:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_threadIdxX:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_threadIdxY:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_threadIdxZ:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()\n", node->getLocalIndex());
         break;
      case TR::com_ibm_gpu_Kernel_syncThreads:
         // barrier0 produces no value, so give the node index back.
         ir.print(" call void @llvm.nvvm.barrier0()\n");
         node->setLocalIndex(_gpuNodeCount--);
         break;
      case TR::java_lang_Math_sqrt:
      case TR::java_lang_Math_sin:
      case TR::java_lang_Math_cos:
      case TR::java_lang_Math_log:
      case TR::java_lang_Math_exp:
      case TR::java_lang_Math_abs_D:
         if (!comp->getOptions()->getEnableGPU(TR_EnableGPUEnableMath)) return false;
         getNodeName(node->getChild(0), name0, comp);
         ir.print(" %%%d = call double @__nv_%s(double %s)\n", node->getLocalIndex(), getNVVMMathFunctionName(node), name0);
         break;
      case TR::java_lang_StrictMath_sin:
      case TR::java_lang_StrictMath_cos:
      case TR::java_lang_StrictMath_log:
      case TR::java_lang_StrictMath_exp:
         // NOTE(review): StrictMath variants take the operand from child 1
         // (not child 0 as the Math variants do) — presumably child 0 is an
         // extra call child for these methods; confirm against the IL shape.
         if (!comp->getOptions()->getEnableGPU(TR_EnableGPUEnableMath)) return false;
         getNodeName(node->getChild(1), name0, comp);
         ir.print(" %%%d = call double @__nv_%s(double %s)\n", node->getLocalIndex(), getNVVMMathFunctionName(node), name0);
         break;
      case TR::java_lang_Math_abs_F:
         // Float abs is the only recognized float-typed math call.
         if (!comp->getOptions()->getEnableGPU(TR_EnableGPUEnableMath)) return false;
         getNodeName(node->getChild(0), name0, comp);
         ir.print(" %%%d = call float @__nv_%s(float %s)\n", node->getLocalIndex(), getNVVMMathFunctionName(node), name0);
         break;
      default:
         return false;
      }
   return true;
   }
764
765
766
/**
 * Emit inline NVVM IR for loads of recognized com.ibm.gpu.Kernel fields
 * (block/thread geometry and syncThreads), mirroring handleRecognizedMethod
 * but keyed on the field's recognized-field id instead of a method id.
 *
 * @return true if IR was emitted; false if the field is not recognized and
 *         the caller must handle the load itself.
 */
bool J9::CodeGenerator::handleRecognizedField(TR::Node *node, NVVMIRBuffer &ir)
   {
   switch (node->getSymbolReference()->getSymbol()->getRecognizedField())
      {
      // ctaid = block index, ntid = block dim, tid = thread index.
      case TR::Symbol::Com_ibm_gpu_Kernel_blockIdxX:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_blockIdxY:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_blockIdxZ:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_blockDimX:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_blockDimY:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_blockDimZ:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_threadIdxX:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_threadIdxY:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_threadIdxZ:
         ir.print(" %%%d = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()\n", node->getLocalIndex());
         break;
      case TR::Symbol::Com_ibm_gpu_Kernel_syncThreads:
         // barrier0 produces no value, so give the node index back.
         ir.print(" call void @llvm.nvvm.barrier0()\n");
         node->setLocalIndex(_gpuNodeCount--);
         break;
      default:
         return false;
      }
   return true;
   }
806
807
/**
 * Emit the NVVM IR loop implementing a TR::arraycopy node: a header block
 * that phi-merges the (length, src, dst) triple, a body block that copies
 * one element (4, 2, or 1 bytes) and advances the pointers, and an exit
 * block.  Backward copies first rewrite src/dst to point at the last element.
 *
 * IMPORTANT: the relative phi operand offsets below (+8/+10, +11/+13,
 * -3/-5, -11/-13) encode exactly how many SSA temps are emitted between the
 * header and the end of the body for each copy variant; the emission order
 * and the _gpuNodeCount increments must stay in lock-step with them.
 */
void J9::CodeGenerator::printArrayCopyNVVMIR(TR::Node *node, NVVMIRBuffer &ir, TR::Compilation *comp)
   {
   //Some forms of array copy have five children. First two nodes are used for write barriers which we don't need
   // Three child version
   //    child 0 ------ Source byte address;
   //    child 1 ------ Destination byte address;
   //    child 2 ------ Copy length in byte;
   // Five child version:
   //    child 0 ------ Source array object; (skipped)
   //    child 1 ------ Destination array object; (skipped)
   //    child 2 ------ Source byte address;
   //    child 3 ------ Destination byte address;
   //    child 4 ------ Copy length in byte;
   //childrenNodeOffset is set such that we access Source byte address, Destination byte address and Copy length.
   int childrenNodeOffset = node->getNumChildren() == 5 ? 2 : 0;

   char name0[MAX_NAME], name1[MAX_NAME], name2[MAX_NAME];
   getNodeName(node->getChild(0+childrenNodeOffset), name0, comp);
   getNodeName(node->getChild(1+childrenNodeOffset), name1, comp);
   getNodeName(node->getChild(2+childrenNodeOffset), name2, comp);

   int arrayCopyID = node->getLocalIndex();
   bool isWordCopy = node->chkWordElementArrayCopy();
   bool isHalfwordCopy = node->chkHalfWordElementArrayCopy();
   bool unknownCopy = !(isWordCopy || isHalfwordCopy);   // element size unknown -> byte copy
   bool isBackwardsCopy = !node->isForwardArrayCopy();
   bool is64bitCopyLength = (node->getChild(2+childrenNodeOffset)->getDataType() == TR::Int64);

   /* Example NVVM IR:

   ; Inputs to the array copy that come from the children:
   %8 = getelementptr inbounds i8* %7, i64 76    ; Source addr
   %10 = getelementptr inbounds i8* %9, i64 76   ; Destination addr
   %14 = mul i64 %13, 2                          ; Copy Length in bytes

   ; Generated ArrayCopy NVVM IR
   ; This is a reverse halfword array copy
   br label %ArrayCopy15
   ArrayCopy15:
   %15 = ptrtoint i8* %8 to i64    ; Generated for reverse array copy.
   %16 = ptrtoint i8* %10 to i64   ; Changes source and destination
   %17 = add i64 %15, %14          ; to point to the end of the array
   %18 = add i64 %16, %14          ; These lines are not generated
   %19 = sub i64 %17, 2            ; for a forward array copy
   %20 = sub i64 %18, 2            ;
   %21 = inttoptr i64 %19 to i8*   ;
   %22 = inttoptr i64 %20 to i8*   ;
   br label %ArrayCopyHeader15
   ArrayCopyHeader15:
   %23 = phi i64 [ %14, %ArrayCopy15 ], [ %36, %ArrayCopyBody15 ] ; Phi nodes save a different value to the temp
   %24 = phi i8* [ %21, %ArrayCopy15 ], [ %34, %ArrayCopyBody15 ] ; based on the name of the previous block before
   %25 = phi i8* [ %22, %ArrayCopy15 ], [ %35, %ArrayCopyBody15 ] ; jumping to ArrayCopyHeader15
   %26 = bitcast i8* %24 to i16*
   %27 = bitcast i8* %25 to i16*
   %28 = icmp sle i64 %23, 0
   br i1 %28, label %AfterArrayCopy15, label %ArrayCopyBody15 ; branch to exit if no more work to do
   ArrayCopyBody15:
   %29 = load i16* %26             ; load data from input array
   store i16 %29, i16* %27         ; store data to output array
   %30 = ptrtoint i16* %26 to i64
   %31 = ptrtoint i16* %27 to i64
   %32 = sub i64 %30, 2            ; sub is used for reverse copy, add used for forward copy
   %33 = sub i64 %31, 2            ; sub is used for reverse copy, add used for forward copy
   %34 = inttoptr i64 %32 to i8*
   %35 = inttoptr i64 %33 to i8*
   %36 = sub i64 %23, 2            ; decrement copy length
   br label %ArrayCopyHeader15
   AfterArrayCopy15:
   */


   //create a new block so the phi nodes know the name of the preceding block
   ir.print(" br label %%ArrayCopy%d\n", arrayCopyID);
   ir.print("ArrayCopy%d:\n", arrayCopyID);

   //for a backwards copy, the source and destination pointers need to be adjusted to
   //point to the last element.
   if (isBackwardsCopy)
      {
      if (!is64bitCopyLength)
         {
         // widen a 32-bit copy length so the pointer math below is all i64
         ir.print(" %%%d = sext %s %s to i64\n",
                     node->getLocalIndex(),
                     getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),
                     name2);
         node->setLocalIndex(_gpuNodeCount++);
         }

      ir.print(" %%%d = ptrtoint %s %s to i64\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),
                  name0);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = ptrtoint %s %s to i64\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),
                  name1);
      node->setLocalIndex(_gpuNodeCount++);

      if (is64bitCopyLength)
         {
         // advance src and dst by the byte length (operand is the child name)
         ir.print(" %%%d = add i64 %%%d, %s\n",
                     node->getLocalIndex(),
                     node->getLocalIndex()-2,
                     name2);
         node->setLocalIndex(_gpuNodeCount++);

         ir.print(" %%%d = add i64 %%%d, %s\n",
                     node->getLocalIndex(),
                     node->getLocalIndex()-2,
                     name2);
         node->setLocalIndex(_gpuNodeCount++);
         }
      else
         {
         // advance src and dst by the sign-extended length emitted above
         ir.print(" %%%d = add i64 %%%d, %%%d\n",
                     node->getLocalIndex(),
                     node->getLocalIndex()-2,
                     node->getLocalIndex()-3);
         node->setLocalIndex(_gpuNodeCount++);

         ir.print(" %%%d = add i64 %%%d, %%%d\n",
                     node->getLocalIndex(),
                     node->getLocalIndex()-2,
                     node->getLocalIndex()-4);
         node->setLocalIndex(_gpuNodeCount++);
         }

      // back up by one element so the pointers address the last element
      ir.print(" %%%d = sub i64 %%%d, %d\n",
                  node->getLocalIndex(),
                  node->getLocalIndex()-2,
                  isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = sub i64 %%%d, %d\n",
                  node->getLocalIndex(),
                  node->getLocalIndex()-2,
                  isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = inttoptr i64 %%%d to %s\n",
                  node->getLocalIndex(),
                  node->getLocalIndex()-2,
                  getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()));
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = inttoptr i64 %%%d to %s\n",
                  node->getLocalIndex(),
                  node->getLocalIndex()-2,
                  getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()));
      node->setLocalIndex(_gpuNodeCount++);
      }

   ir.print(" br label %%ArrayCopyHeader%d\n", arrayCopyID);
   ir.print("ArrayCopyHeader%d:\n", arrayCopyID);

   //copy length in bytes
   ir.print(" %%%d = phi %s [ %s, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",
               node->getLocalIndex(),
               getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),
               name2,
               arrayCopyID,
               unknownCopy ? node->getLocalIndex()+11 : node->getLocalIndex()+13,
               arrayCopyID);
   node->setLocalIndex(_gpuNodeCount++);

   if (!isBackwardsCopy)
      {
      //source address
      ir.print(" %%%d = phi %s [ %s, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),
                  name0,
                  arrayCopyID,
                  unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,
                  arrayCopyID);
      node->setLocalIndex(_gpuNodeCount++);

      //destination address
      ir.print(" %%%d = phi %s [ %s, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),
                  name1,
                  arrayCopyID,
                  unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,
                  arrayCopyID);
      node->setLocalIndex(_gpuNodeCount++);
      }
   else
      {
      //source address (initial value is the adjusted pointer emitted above)
      ir.print(" %%%d = phi %s [ %%%d, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),
                  node->getLocalIndex()-3,
                  arrayCopyID,
                  unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,
                  arrayCopyID);
      node->setLocalIndex(_gpuNodeCount++);

      //destination address (initial value is the adjusted pointer emitted above)
      ir.print(" %%%d = phi %s [ %%%d, %%ArrayCopy%d ], [ %%%d, %%ArrayCopyBody%d ]\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),
                  node->getLocalIndex()-3,
                  arrayCopyID,
                  unknownCopy ? node->getLocalIndex()+8 : node->getLocalIndex()+10,
                  arrayCopyID);
      node->setLocalIndex(_gpuNodeCount++);
      }

   //change pointer types from i8* if copying halfword or word data
   if (isWordCopy || isHalfwordCopy)
      {
      ir.print(" %%%d = bitcast %s %%%d to %s\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),
                  node->getLocalIndex()-2,
                  isWordCopy ? "i32*" : "i16*");
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = bitcast %s %%%d to %s\n",
                  node->getLocalIndex(),
                  getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),
                  node->getLocalIndex()-2,
                  isWordCopy ? "i32*" : "i16*");
      node->setLocalIndex(_gpuNodeCount++);
      }

   //check if byte length is less than or equal to zero and skip the copy if true
   ir.print(" %%%d = icmp sle %s %%%d, 0\n",
               node->getLocalIndex(),
               getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),
               unknownCopy ? node->getLocalIndex()-3 : node->getLocalIndex()-5);
   node->setLocalIndex(_gpuNodeCount++);

   ir.print(" br i1 %%%d, label %%AfterArrayCopy%d, label %%ArrayCopyBody%d\n",
               node->getLocalIndex()-1,
               arrayCopyID,
               arrayCopyID);

   ir.print("ArrayCopyBody%d:\n", arrayCopyID);

   //load data to copy
   ir.print(" %%%d = load %s %%%d\n",
               node->getLocalIndex(),
               isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),
               node->getLocalIndex()-3);
   node->setLocalIndex(_gpuNodeCount++);

   //store loaded data (store produces no SSA value, so no index bump)
   ir.print(" store %s %%%d, %s %%%d\n",
               isWordCopy ? "i32" : isHalfwordCopy ? "i16" : "i8",
               node->getLocalIndex()-1,
               isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),
               node->getLocalIndex()-3);

   ir.print(" %%%d = ptrtoint %s %%%d to i64\n",
               node->getLocalIndex(),
               isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()),
               node->getLocalIndex()-4);
   node->setLocalIndex(_gpuNodeCount++);

   ir.print(" %%%d = ptrtoint %s %%%d to i64\n",
               node->getLocalIndex(),
               isWordCopy ? "i32*" : isHalfwordCopy ? "i16*" : getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()),
               node->getLocalIndex()-4);
   node->setLocalIndex(_gpuNodeCount++);

   //move source pointer
   ir.print(" %%%d = %s i64 %%%d, %d\n",
               node->getLocalIndex(),
               isBackwardsCopy ? "sub" : "add",
               node->getLocalIndex()-2,
               isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);
   node->setLocalIndex(_gpuNodeCount++);

   //move destination pointer
   ir.print(" %%%d = %s i64 %%%d, %d\n",
               node->getLocalIndex(),
               isBackwardsCopy ? "sub" : "add",
               node->getLocalIndex()-2,
               isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);
   node->setLocalIndex(_gpuNodeCount++);

   ir.print(" %%%d = inttoptr i64 %%%d to %s\n",
               node->getLocalIndex(),
               node->getLocalIndex()-2,
               getTypeName(node->getChild(0+childrenNodeOffset)->getDataType()));
   node->setLocalIndex(_gpuNodeCount++);

   ir.print(" %%%d = inttoptr i64 %%%d to %s\n",
               node->getLocalIndex(),
               node->getLocalIndex()-2,
               getTypeName(node->getChild(1+childrenNodeOffset)->getDataType()));
   node->setLocalIndex(_gpuNodeCount++);

   //decrement copy length
   // NOTE(review): unlike every other emitted value, the local index is not
   // bumped after this sub — presumably the caller advances it for the
   // arraycopy node itself; confirm against printNVVMIR's node accounting.
   ir.print(" %%%d = sub %s %%%d, %d\n",
               node->getLocalIndex(),
               getTypeName(node->getChild(2+childrenNodeOffset)->getDataType()),
               unknownCopy ? node->getLocalIndex()-11 : node->getLocalIndex()-13,
               isWordCopy ? 4 : isHalfwordCopy ? 2 : 1);

   ir.print(" br label %%ArrayCopyHeader%d\n", arrayCopyID);
   ir.print("AfterArrayCopy%d:\n", arrayCopyID);
   }
1115
1116
/**
 * \brief Answers whether \p symRef refers to the implicit `this` argument.
 *
 * A symbol is treated as the `this` pointer when it is a parameter occupying
 * slot 0 (the receiver slot of an instance method).
 *
 * \param symRef symbol reference to test
 * \return true if the symbol is the parameter in slot 0
 */
bool isThisPointer(TR::SymbolReference * symRef)
   {
   TR::Symbol *sym = symRef->getSymbol();
   // Use a checked-intent named cast instead of a C-style cast; the isParm()
   // guard on the left of && guarantees the downcast to ParameterSymbol.
   return sym->isParm() &&
          static_cast<TR::ParameterSymbol *>(sym)->getSlot() == 0;
   }
1121
1122
char * getTypeNameFromSignature(char* sig, int32_t sigLength)
1123
{
1124
TR_ASSERT(sigLength == 2 && sig[0] == '[', "only handling static shared arrays");
1125
switch (sig[1])
1126
{
1127
case 'Z': return "i8";
1128
case 'B': return "i8";
1129
case 'C': return "i16";
1130
case 'S': return "i16";
1131
case 'I': return "i32";
1132
case 'J': return "i64";
1133
case 'F': return "float";
1134
case 'D': return "double";
1135
}
1136
TR_ASSERT(false, "unsupported shared array type\n");
1137
return NULL;
1138
}
1139
1140
/**
 * \brief Determines whether \p node loads a field that is annotated as a
 *        shared-memory array for a GPU compilation.
 *
 * \param node         node whose symbol reference is inspected
 * \param sharedMemory shared-memory annotations to search
 * \param comp         current compilation
 * \return true only for a non-auto/parm symbol with a valid constant-pool
 *         index whose annotation records a positive size
 */
static bool isSharedMemory(TR::Node *node, TR_SharedMemoryAnnotations *sharedMemory, TR::Compilation *comp)
   {
   if (!comp->isGPUCompilation())
      return false;

   TR::SymbolReference *symRef = node->getSymbolReference();

   // Autos/parms are never shared-memory fields, and a CP index of -1 means
   // there is no field to look up.
   if (symRef->getSymbol()->isAutoOrParm() || symRef->getCPIndex() == -1)
      return false;

   return sharedMemory->find(comp, symRef).getSize() > 0;
   }
1152
1153
1154
/**
 * \brief Emits NVVM IR text for one IL node and, recursively, its children.
 *
 * Virtual-register numbering is driven by _gpuNodeCount: a node that produces
 * a value records its register via setLocalIndex(_gpuNodeCount++); opcodes
 * that produce no value immediately give the number back with
 * setLocalIndex(_gpuNodeCount--) to keep the numbering dense.
 *
 * \param ir            buffer receiving the generated NVVM IR text
 * \param node          IL node to translate
 * \param loop          enclosing loop region (used for invariance queries)
 * \param targetBlocks  set of blocks that need an explicit IR label
 * \param visitCount    visit count ensuring each node is emitted only once
 * \param sharedMemory  annotations describing shared-memory arrays
 * \param nextParmNum   next free number for fields promoted to extra kernel
 *                      parameters; incremented as new fields are discovered
 * \param errorNode     error-reporting out-parameter; only passed through to
 *                      recursive calls in this function
 * \return GPUSuccess, or GPUInvalidProgram when the node cannot be translated
 */
TR::CodeGenerator::GPUResult
J9::CodeGenerator::printNVVMIR(
      NVVMIRBuffer &ir,
      TR::Node * node,
      TR_RegionStructure *loop,
      TR_BitVector *targetBlocks,
      vcount_t visitCount,
      TR_SharedMemoryAnnotations *sharedMemory,
      int32_t &nextParmNum,
      TR::Node * &errorNode)
   {
   GPUResult result;

   // Exception checks can be suppressed globally via an environment variable.
   static bool enableExceptionChecks = (feGetEnv("TR_disableGPUExceptionCheck") == NULL);
   TR::ILOpCode opcode = node->getOpCode();

   char name0[MAX_NAME], name1[MAX_NAME];
   bool isGenerated = false;
   bool printChildrenWithRefCount1 = true;

   if (node->isProfilingCode())
      {
      // Nothing to generate for profiling code, but we still need to visit the children
      // We can skip over the children with a reference count of one since they aren't used anywhere else.
      isGenerated = true;
      printChildrenWithRefCount1 = false;
      }

   if (node->getOpCodeValue() == TR::compressedRefs)
      {
      if (loop->isExprInvariant(node))
         return GPUSuccess; // ignore for now
      node = node->getFirstChild();
      }

   if (node->getOpCodeValue() == TR::treetop)
      node = node->getFirstChild();

   if (self()->comp()->isGPUCompilation() &&
       opcode.isLoadVarDirect() &&
       isThisPointer(node->getSymbolReference()))
      return GPUSuccess; // will handle in the parent

   if (opcode.isLoadConst())
      {
      if((node->getDataType() == TR::Address) && (node->getAddress() != 0))
         {
         traceMsg(self()->comp(), "Load Const with a non-zero address in node %p\n", node);
         return GPUInvalidProgram;
         }
      else
         {
         return GPUSuccess; // will handle in the parent
         }
      }

   if (node->getOpCodeValue() == TR::asynccheck)
      return GPUSuccess;

   if (!enableExceptionChecks &&
       (opcode.isNullCheck() || opcode.isBndCheck() || node->getOpCodeValue() == TR::DIVCHK))
      return GPUSuccess;

   // Emit each node at most once per walk.
   if (node->getVisitCount() == visitCount)
      return GPUSuccess;

   node->setVisitCount(visitCount);

   if (opcode.isNullCheck())
      {
      TR::Node *refNode = node->getNullCheckReference();

      if (isSharedMemory(refNode, sharedMemory, self()->comp()))
         {
         // Shared Memory is always allocated
         ir.print("; DELETE NULLCHK [%p] since this reference [%p] is allocated in shared memory\n",
                  node, refNode);
         return GPUSuccess;
         }
      // A null check on a parameter can be hoisted to kernel entry when this
      // block post-dominates the start block.
      if (_gpuPostDominators && _gpuPostDominators->dominates(_gpuCurrentBlock, _gpuStartBlock))
         {
         TR::SymbolReference *symRef = refNode->getSymbolReference();
         if (symRef->getSymbol()->isParm())
            {
            int32_t argpos = symRef->getCPIndex();
            ir.print("; DELETE NULLCHK [%p] (ref[%p] is parm %d) since BB[%d] postdominates BB[%d]\n",
                     node, refNode, argpos, _gpuCurrentBlock->getNumber(), _gpuStartBlock->getNumber());
            _gpuNeedNullCheckArguments_vector |= (1L << (uint64_t)argpos);
            return GPUSuccess;
            }
         }

      result = self()->printNVVMIR(ir, refNode, loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);
      if (result != GPUSuccess) return result;

      getNodeName(refNode, name0, self()->comp());
      const char *type0 = getTypeName(refNode->getDataType());

      node->setLocalIndex(_gpuNodeCount++);

      // Fixed: a stray fourth argument (uninitialized name1) was passed for a
      // three-specifier format string.
      ir.print(" %%%d = icmp eq %s %s, null\n",
               node->getLocalIndex(), type0, name0);
      ir.print(" br i1 %%%d, label %%NullException, label %%nullchk_fallthru_%d, !prof !0\n",
               node->getLocalIndex(), node->getLocalIndex());
      ir.print("nullchk_fallthru_%d:\n", node->getLocalIndex());

      _gpuHasNullCheck = true;
      isGenerated = true;
      }
   else if (opcode.isBndCheck())
      {
      bool isSMReference = false;
      int32_t smsize = -1;
      // For shared-memory arrays the length is a compile-time constant, so
      // the arraylength load can be folded away.
      if (node->getChild(0)->getOpCodeValue() == TR::arraylength)
         {
         TR::Node *refNode = node->getChild(0)->getChild(0);
         if (isSharedMemory(refNode, sharedMemory, self()->comp()))
            {
            TR_SharedMemoryField field = sharedMemory->find(self()->comp(), refNode->getSymbolReference());
            smsize = field.getSize();
            TR_ASSERT(smsize > 0, "should be annotated as shared array with positive size");

            isSMReference = true;
            ir.print("; USE CONSTANT LENGTH %d for NULLCHK [%p] since this reference [%p] is allocated in shared memory\n",
                     smsize, node, refNode);
            }
         }

      if (!isSMReference)
         {
         result = self()->printNVVMIR(ir, node->getChild(0), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);
         if (result != GPUSuccess) return result;
         }
      result = self()->printNVVMIR(ir, node->getChild(1), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);
      if (result != GPUSuccess) return result;

      if (!isSMReference)
         {
         getNodeName(node->getChild(0), name0, self()->comp());
         }
      else
         {
         TR::snprintfNoTrunc(name0, MAX_NAME, "%" OMR_PRId32, smsize);
         }

      getNodeName(node->getChild(1), name1, self()->comp());
      const char *type0 = getTypeName(node->getChild(0)->getDataType());

      node->setLocalIndex(_gpuNodeCount++);

      // Unsigned length <= index means the index is out of bounds.
      ir.print(" %%%d = icmp ule %s %s, %s\n",
               node->getLocalIndex(), type0, name0, name1);
      ir.print(" br i1 %%%d, label %%BndException, label %%bndchk_fallthru_%d, !prof !0\n",
               node->getLocalIndex(), node->getLocalIndex());
      ir.print("bndchk_fallthru_%d:\n", node->getLocalIndex());

      _gpuHasBndCheck = true;
      isGenerated = true;
      }
   else if (node->getOpCodeValue() == TR::DIVCHK)
      {
      TR::Node *idivNode = node->getChild(0);
      result = self()->printNVVMIR(ir, idivNode->getChild(0), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);
      if (result != GPUSuccess) return result;

      result = self()->printNVVMIR(ir, idivNode->getChild(1), loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);
      if (result != GPUSuccess) return result;

      // Only the divisor (child 1) needs the zero test.
      getNodeName(idivNode->getChild(1), name0, self()->comp());
      const char *type0 = getTypeName(idivNode->getChild(1)->getDataType());

      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = icmp eq %s %s, 0\n",
               node->getLocalIndex(), type0, name0);
      ir.print(" br i1 %%%d, label %%DivException, label %%divchk_fallthru_%d, !prof !0\n",
               node->getLocalIndex(), node->getLocalIndex());
      ir.print("divchk_fallthru_%d:\n", node->getLocalIndex());

      _gpuHasDivCheck = true;
      isGenerated = true;
      }

   // This symbol reference should become a parameter
   // children should be skipped (they are loop invariant)
   if (node->getOpCode().isLoadVar() &&
       _gpuSymbolMap[node->getSymbolReference()->getReferenceNumber()]._parmSlot != -1)
      {
      getParmName(_gpuSymbolMap[node->getSymbolReference()->getReferenceNumber()]._parmSlot, name0);

      node->setLocalIndex(_gpuNodeCount++);
      traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());

      ir.print(" %%%d = %s %s* %s, align %d\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getDataType()),
               name0,
               node->getSize());
      return GPUSuccess;
      }

   //Don't run printNVVMIR on a children node if:
   //(they are the child of a profiling call) AND ((have a reference count less then two) OR (is a loadConst node))
   for (int32_t i = 0; i < node->getNumChildren(); ++i)
      {
      TR::Node *child = node->getChild(i);
      if ((child->getReferenceCount() >= 2 && !child->getOpCode().isLoadConst()) || printChildrenWithRefCount1)
         {
         result = self()->printNVVMIR(ir, child, loop, targetBlocks, visitCount, sharedMemory, nextParmNum, errorNode);
         if (result != GPUSuccess)
            return result;
         }
      }

   if (isGenerated)
      {
      return GPUSuccess;
      }

   node->setLocalIndex(_gpuNodeCount++);
   traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());

   if (node->getOpCodeValue() == TR::PassThrough)
      {
      // No value produced; give the reserved register number back.
      node->setLocalIndex(_gpuNodeCount--);
      return GPUSuccess;
      }
   else if (node->getOpCodeValue() == TR::BBStart)
      {
      node->setLocalIndex(_gpuNodeCount--);
      _gpuCurrentBlock = node->getBlock();
      if (targetBlocks->get(_gpuCurrentBlock->getNumber()))
         ir.print("block_%d:\n", _gpuCurrentBlock->getNumber());
      }
   // if block has a label previous block has to end with a branch
   else if (node->getOpCodeValue() == TR::BBEnd &&
            !_gpuCurrentBlock->endsInBranch() &&
            !_gpuCurrentBlock->getLastRealTreeTop()->getNode()->getOpCode().isReturn() &&
            !_gpuCurrentBlock->getLastRealTreeTop()->getNode()->getOpCode().isGoto() &&
            !_gpuCurrentBlock->getLastRealTreeTop()->getNode()->getOpCode().isSwitch() &&
            _gpuCurrentBlock->getNextBlock() &&
            targetBlocks->get(_gpuCurrentBlock->getNextBlock()->getNumber()))
      {
      node->setLocalIndex(_gpuNodeCount--);
      ir.print(" br label %%block_%d\n",
               _gpuCurrentBlock->getNextBlock()->getNumber());
      }
   else if (node->getOpCodeValue() == TR::BBEnd)
      {
      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (node->getOpCode().isReturn())
      {
      node->setLocalIndex(_gpuNodeCount--);

      if (node->getNumChildren() == 0)
         {
         ir.print(" %s void\n",
                  getOpCodeName(node->getOpCodeValue()));
         }
      else
         {
         TR_ASSERT(node->getNumChildren() == 1, "Unsupported return\n");
         getNodeName(node->getChild(0), name0, self()->comp());
         ir.print(" %s %s %s\n",
                  getOpCodeName(node->getOpCodeValue()),
                  getTypeName(node->getDataType()),
                  name0);
         }
      }
   else if (node->getOpCode().isStoreIndirect()) // TODO: handle statics
      {
      TR::Node *firstChild = node->getChild(0);
      TR::Node *secondChild = node->getChild(1);
      getNodeName(firstChild, name0, self()->comp());
      getNodeName(secondChild, name1, self()->comp());

         {
         // The bitcast reuses this node's register; the store itself
         // produces no value, so the count ends where it started.
         _gpuNodeCount--;
         ir.print(" %%%d = bitcast %s %s to %s %s*\n",
                  _gpuNodeCount,
                  getTypeName(firstChild->getDataType()), name0,
                  getTypeName(secondChild->getDataType()),
                  firstChild->chkSharedMemory() ? "addrspace(3)" : "");

         ir.print(" %s %s %s, %s %s * %%%d, align %d\n",
                  getOpCodeName(node->getOpCodeValue()),
                  getTypeName(secondChild->getDataType()),
                  name1,
                  getTypeName(secondChild->getDataType()),
                  firstChild->chkSharedMemory() ? "addrspace(3)" : "",
                  _gpuNodeCount,
                  secondChild->getSize());
         _gpuNodeCount++;
         }
      }
   else if (node->getOpCode().isLoadIndirect()) // TODO: handle statics
      {
      TR::Node *firstChild = node->getChild(0);

      getNodeName(firstChild, name0, self()->comp());

      if (node->getSymbolReference()->getCPIndex() != -1) // field of some object
         {
         // TODO: check that field is an array!
         TR_ASSERT(firstChild->getOpCode().isLoadDirect() &&
                   isThisPointer(firstChild->getSymbolReference()),
                   "can only access a field of this object\n");

         // TODO: handle duplicate names from different classes
         TR_SharedMemoryField field = sharedMemory->find(self()->comp(), node->getSymbolReference());
         TR_ASSERT(field.getSize() >= 0, "field was not found in this object\n");

         if (field.getSize() > 0)
            {
            // Annotated shared-memory array: address of the addrspace(3) global.
            ir.print(" %%%d = bitcast [%d x %s] addrspace(3)* @%.*s to i8*\n",
                     node->getLocalIndex(),
                     field.getSize(),
                     getTypeNameFromSignature(field.getFieldSig(), field.getFieldSigLength()),
                     field.getFieldNameLength(), field.getFieldName());

            node->setSharedMemory(true);
            }
         else
            {
            // Unannotated field: promote it to an extra kernel parameter.
            int32_t parmNum = field.getParmNum();
            if (parmNum == -1)
               {
               sharedMemory->setParmNum(self()->comp(), node->getSymbolReference(), nextParmNum);
               parmNum = nextParmNum++;
               }
            ir.print(" %%%d = %s %s* %%p%d.addr, align %d\n",
                     node->getLocalIndex(),
                     getOpCodeName(node->getOpCodeValue()),
                     getTypeName(node->getDataType()),
                     parmNum,
                     node->getSize());
            }
         }
      else
         {
         // assume SM35 or more
         static bool disableReadOnlyCacheArray = (feGetEnv("TR_disableGPUReadOnlyCacheArray") != NULL);
         bool isReadOnlyArray = false;
         if (node->getSymbolReference()->getSymbol()->isArrayShadowSymbol() &&
             // I disabled to generate ld.global.nc for read-only address array
             // I do not know an intrinsic function name for ld.global.nc of address
             node->getDataType() != TR::Address &&
             !disableReadOnlyCacheArray &&
             _gpuCanUseReadOnlyCache)
            {
            TR::Node *addrNode = node->getFirstChild();
            if (addrNode->getOpCodeValue() == TR::aiadd || addrNode->getOpCodeValue() == TR::aladd)
               {
               addrNode = addrNode->getFirstChild();
               }
            if ((addrNode->getOpCodeValue() == TR::aload) || (addrNode->getOpCodeValue() == TR::aloadi))
               {
               TR::SymbolReference *symRef = addrNode->getSymbolReference();
               int32_t symRefIndex = symRef->getReferenceNumber();
               CS2::ArrayOf<gpuMapElement, TR::Allocator> &gpuSymbolMap = self()->comp()->cg()->_gpuSymbolMap;

               int32_t nc = symRefIndex;
               TR::SymbolReference *hostSymRef = gpuSymbolMap[nc]._hostSymRef;
               int32_t parmSlot = gpuSymbolMap[nc]._parmSlot;

               if (!hostSymRef || parmSlot == -1)
                  {
                  // Follow one level of astore <- aloadi to the parameter.
                  TR::Node *tempNode = gpuSymbolMap[nc]._node;
                  if (tempNode && (tempNode->getOpCodeValue() == TR::astore) && (tempNode->getFirstChild()->getOpCodeValue() == TR::aloadi))
                     {
                     TR::Node *parmNode = tempNode->getFirstChild();
                     nc = parmNode->getSymbolReference()->getReferenceNumber();

                     hostSymRef = gpuSymbolMap[nc]._hostSymRef;
                     parmSlot = gpuSymbolMap[nc]._parmSlot;
                     }
                  }
               else if (hostSymRef->getReferenceNumber() != symRefIndex)
                  {
                  hostSymRef = NULL;
                  }

               // Only arrays that are read and never written qualify.
               if (hostSymRef && (parmSlot != -1) &&
                   (gpuSymbolMap[nc]._accessKind & TR::CodeGenerator::ReadWriteAccesses) == TR::CodeGenerator::ReadAccess)
                  {
                  isReadOnlyArray = true;
                  }
               }
            }

         ir.print(" %%%d = bitcast %s %s to %s %s*\n",
                  _gpuNodeCount-1,
                  getTypeName(firstChild->getDataType()),
                  name0,
                  getTypeName(node->getDataType()),
                  firstChild->chkSharedMemory() ? "addrspace(3)" : isReadOnlyArray ? "addrspace(1)" : "");

         node->setLocalIndex(_gpuNodeCount++);
         traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());

         //NVVM 1.3 onward uses a two parameter version of ldg
         if (isReadOnlyArray)
            {
            if (_gpuUseOldLdgCalls)
               {
               ir.print(" %%%d = tail call %s @llvm.nvvm.ldg.global.%s.%s.p1%s(%s addrspace(1)* %%%d), !align !1%d\n",
                        node->getLocalIndex(),
                        getTypeName(node->getDataType()),
                        (node->getDataType() >= TR::Float) ? "f" : "i",
                        getVarTypeName(node->getDataType()),
                        getVarTypeName(node->getDataType()),
                        getTypeName(node->getDataType()),
                        _gpuNodeCount-2,
                        node->getSize());
               }
            else
               {
               ir.print(" %%%d = tail call %s @llvm.nvvm.ldg.global.%s.%s.p1%s(%s addrspace(1)* %%%d, i32 %d)\n",
                        node->getLocalIndex(),
                        getTypeName(node->getDataType()),
                        (node->getDataType() >= TR::Float) ? "f" : "i",
                        getVarTypeName(node->getDataType()),
                        getVarTypeName(node->getDataType()),
                        getTypeName(node->getDataType()),
                        _gpuNodeCount-2,
                        node->getSize());
               }
            }
         else
            // e.g. %32 = load i32 addrspace(4) * %31, align 4
            ir.print(" %%%d = %s %s %s * %%%d, align %d\n",
                     node->getLocalIndex(),
                     getOpCodeName(node->getOpCodeValue()),
                     getTypeName(node->getDataType()),
                     firstChild->chkSharedMemory() ? "addrspace(3)" : "",
                     _gpuNodeCount-2,
                     node->getSize());
         }
      }
   else if (node->getOpCode().isCall() &&
            ((TR::MethodSymbol*)node->getSymbolReference()->getSymbol())->getRecognizedMethod() != TR::unknownMethod &&
            self()->handleRecognizedMethod(node, ir, self()->comp()))
      {
      }
   else if (node->getOpCodeValue() == TR::arraycopy)
      {
      self()->printArrayCopyNVVMIR(node, ir, self()->comp());
      }
   else if (node->getOpCode().isCall())
      {
      traceMsg(self()->comp(), "unrecognized call %p\n", node);
      return GPUInvalidProgram;
      }
   else if (node->getOpCode().isStoreDirect() &&
            node->getSymbolReference()->getSymbol()->getRecognizedField() != TR::Symbol::UnknownField)
      {
      switch (node->getSymbolReference()->getSymbol()->getRecognizedField())
         {
         case TR::Symbol::Com_ibm_gpu_Kernel_syncThreads:
            ir.print(" call void @llvm.nvvm.barrier0()\n");
            break;
         default:
            break;
         }
      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (node->getOpCode().isLoadVarDirect() &&
            node->getSymbolReference()->getSymbol()->getRecognizedField() != TR::Symbol::UnknownField &&
            self()->handleRecognizedField(node, ir))
      {
      }
   else if (node->getOpCode().isLoadVarDirect())
      {
      if (!node->getSymbol()->isAutoOrParm())
         {
         // Fixed: the %p specifier previously had no matching argument.
         traceMsg(self()->comp(), "unexpected symbol in node %p\n", node);
         return GPUInvalidProgram;
         }

      getAutoOrParmName(node->getSymbol(), name0);

      ir.print(" %%%d = %s %s* %s, align %d\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getDataType()),
               name0,
               node->getSize());
      }
   else if (node->getOpCode().isStoreDirect())
      {
      if (!node->getSymbol()->isAutoOrParm())
         {
         // Fixed: the %p specifier previously had no matching argument.
         traceMsg(self()->comp(), "unexpected symbol in node %p\n", node);
         return GPUInvalidProgram;
         }

      getNodeName(node->getChild(0), name0, self()->comp());
      getAutoOrParmName(node->getSymbol(), name1);

      ir.print(" %s %s %s, %s* %s, align %d\n",
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getChild(0)->getDataType()),
               name0,
               getTypeName(node->getDataType()),
               name1,
               node->getChild(0)->getSize());

      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (node->getOpCode().isArrayRef())
      {
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());

      ir.print(" %%%d = %s inbounds %s %s, %s %s\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getChild(0)->getDataType()),
               name0,
               getTypeName(node->getChild(1)->getDataType()), name1);

      // An address into a shared-memory array stays in shared memory.
      if (node->getChild(0)->chkSharedMemory())
         node->setSharedMemory(true);
      }
   else if (node->getOpCodeValue() == TR::arraylength)
      {
      // assume SM35 or more
      static bool disableReadOnlyCacheObjHdr = (feGetEnv("TR_disableGPUReadOnlyCacheObjHdr") != NULL);
      getNodeName(node->getChild(0), name0, self()->comp());

      ir.print(" %%%d = getelementptr inbounds i8* %s, i32 %d\n",
               node->getLocalIndex(),
               name0,
               self()->objectLengthOffset());

      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = bitcast i8* %%%d to i32 %s*\n",
               node->getLocalIndex(),
               node->getLocalIndex() - 1,
               (_gpuCanUseReadOnlyCache && !disableReadOnlyCacheObjHdr) ? "addrspace(1)" : "");

      node->setLocalIndex(_gpuNodeCount++);

      //NVVM 1.3 onward uses a two parameter version of ldg
      if (_gpuCanUseReadOnlyCache && !disableReadOnlyCacheObjHdr)
         {
         if (_gpuUseOldLdgCalls)
            {
            ir.print(" %%%d = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%%d), !align !14\n",
                     node->getLocalIndex(),
                     node->getLocalIndex() - 1);
            }
         else
            {
            ir.print(" %%%d = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%%d, i32 4)\n",
                     node->getLocalIndex(),
                     node->getLocalIndex() - 1);
            }
         }
      else
         ir.print(" %%%d = load i32* %%%d, align 4\n",
                  node->getLocalIndex(),
                  node->getLocalIndex() - 1);
      }
   // Binary Operations
   else if ((node->getOpCodeValue() == TR::lshl ||
             node->getOpCodeValue() == TR::lshr) &&
            node->getChild(1)->getDataType() == TR::Int32)
      {
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());

      // LLVM requires both shift operands to have the same width.
      ir.print(" %%%d = sext i32 %s to i64\n",
               node->getLocalIndex(),
               name1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = %s %s %s, %%%d\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getDataType()),
               name0, _gpuNodeCount-2);

      }
   else if (node->getOpCodeValue() == TR::imulh || node->getOpCodeValue() == TR::iumulh ||
            node->getOpCodeValue() == TR::lmulh || node->getOpCodeValue() == TR::lumulh)
      {
      // High-word multiply: widen, multiply, shift the product down, truncate.
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());

      bool isLongMul = node->getOpCodeValue() == TR::lmulh || node->getOpCodeValue() == TR::lumulh;
      bool isSignedMul = node->getOpCodeValue() == TR::imulh || node->getOpCodeValue() == TR::lmulh;

      bool extendChild0 = isLongMul || (node->getChild(0)->getDataType() != TR::Int64);
      bool extendChild1 = isLongMul || (node->getChild(1)->getDataType() != TR::Int64);

      if (extendChild0)
         {
         ir.print(" %%%d = %s %s %s to %s\n",
                  node->getLocalIndex(),
                  isSignedMul ? "sext" : "zext",
                  getTypeName(node->getChild(0)->getDataType()),
                  name0,
                  isLongMul ? "i128" : "i64");
         node->setLocalIndex(_gpuNodeCount++);
         }
      else
         {
         // Already i64: emit a no-op shift so register numbering stays uniform.
         ir.print(" %%%d = lshr i64 %s, 0\n",
                  node->getLocalIndex(),
                  name0);
         node->setLocalIndex(_gpuNodeCount++);
         }

      if(extendChild1)
         {
         ir.print(" %%%d = %s %s %s to %s\n",
                  node->getLocalIndex(),
                  isSignedMul ? "sext" : "zext",
                  getTypeName(node->getChild(1)->getDataType()),
                  name1,
                  isLongMul ? "i128" : "i64");
         node->setLocalIndex(_gpuNodeCount++);
         }
      else
         {
         ir.print(" %%%d = lshr i64 %s, 0\n",
                  node->getLocalIndex(),
                  name1);
         node->setLocalIndex(_gpuNodeCount++);
         }

      ir.print(" %%%d = mul %s %%%d, %%%d\n",
               node->getLocalIndex(),
               isLongMul ? "i128" : "i64",
               node->getLocalIndex()-2,
               node->getLocalIndex()-1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = lshr %s %%%d, %s\n",
               node->getLocalIndex(),
               isLongMul ? "i128" : "i64",
               node->getLocalIndex()-1,
               isLongMul ? "64" : "32");
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = trunc %s %%%d to %s\n",
               node->getLocalIndex(),
               isLongMul ? "i128" : "i64",
               node->getLocalIndex()-1,
               isLongMul ? "i64" : "i32");
      }
   else if (node->getOpCodeValue() == TR::bneg || node->getOpCodeValue() == TR::sneg ||
            node->getOpCodeValue() == TR::ineg || node->getOpCodeValue() == TR::lneg ||
            node->getOpCodeValue() == TR::fneg || node->getOpCodeValue() == TR::dneg)
      {
      getNodeName(node->getChild(0), name0, self()->comp());

      bool isFloatDouble = node->getOpCodeValue() == TR::fneg || node->getOpCodeValue() == TR::dneg;

      // Negation is emitted as (0 - x); floats need the "0.0" literal form.
      ir.print(" %%%d = %s %s %s, %s\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getDataType()),
               isFloatDouble ? "0.0" : "0",
               name0);
      }
   else if (node->getOpCodeValue() == TR::iabs || node->getOpCodeValue() == TR::labs)
      {
      getNodeName(node->getChild(0), name0, self()->comp());

      bool isInt = node->getOpCodeValue() == TR::iabs;

      // Branch-free abs: mask = x >> (bits-1); result = (x ^ mask) - mask.
      ir.print(" %%%d = ashr %s %s, %s\n",
               node->getLocalIndex(),
               getTypeName(node->getDataType()),
               name0,
               isInt ? "31" : "63");
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = xor %s %s, %%%d\n",
               node->getLocalIndex(),
               getTypeName(node->getDataType()),
               name0,
               node->getLocalIndex()-1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = sub %s %%%d, %%%d\n",
               node->getLocalIndex(),
               getTypeName(node->getDataType()),
               node->getLocalIndex()-1,
               node->getLocalIndex()-2);
      }
   else if (node->getOpCodeValue() == TR::irol || node->getOpCodeValue() == TR::lrol)
      {
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());

      bool isInt = node->getOpCodeValue() == TR::irol;

      // Rotate-left as (x << n) | (x >> ((bits - n) & (bits-1))).
      ir.print(" %%%d = shl %s %s, %s\n",
               node->getLocalIndex(),
               getTypeName(node->getDataType()),
               name0, name1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = sub %s %s, %s\n",
               node->getLocalIndex(),
               getTypeName(node->getChild(1)->getDataType()),
               isInt ? "32" : "64",
               name1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = and %s %%%d, %s\n",
               node->getLocalIndex(),
               getTypeName(node->getChild(1)->getDataType()),
               node->getLocalIndex()-1,
               isInt ? "31" : "63");
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = lshr %s %s, %%%d\n",
               node->getLocalIndex(),
               getTypeName(node->getDataType()),
               name0,
               node->getLocalIndex()-1);
      node->setLocalIndex(_gpuNodeCount++);

      ir.print(" %%%d = or %s %%%d, %%%d\n",
               node->getLocalIndex(),
               getTypeName(node->getDataType()),
               node->getLocalIndex()-4,
               node->getLocalIndex()-1);
      }
   else if (node->getOpCodeValue() == TR::ibits2f || node->getOpCodeValue() == TR::fbits2i ||
            node->getOpCodeValue() == TR::lbits2d || node->getOpCodeValue() == TR::dbits2l)
      {
      getNodeName(node->getChild(0), name0, self()->comp());

      ir.print(" %%%d = %s %s %s to %s\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getChild(0)->getDataType()),
               name0,
               getTypeName(node->getDataType()));
      }
   else if (node->getOpCode().isArithmetic() &&
            node->getNumChildren() == 2 &&
            getOpCodeName(node->getOpCodeValue())) // supported binary opcode
      {
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());

      ir.print(" %%%d = %s %s %s, %s\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getDataType()),
               name0, name1);
      }
   else if (node->getOpCode().isConversion() &&
            getOpCodeName(node->getOpCodeValue()))
      {
      getNodeName(node->getChild(0), name0, self()->comp());

      ir.print(" %%%d = %s %s %s to %s\n",
               node->getLocalIndex(),
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getChild(0)->getDataType()),
               name0,
               getTypeName(node->getDataType()));
      }
   else if (node->getOpCode().isIf())
      {
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());
      const char *type0 = getTypeName(node->getChild(0)->getDataType());

      const char *opcode = getOpCodeName(node->getOpCodeValue());

      // Fall-through edge targets the lexically next block.
      ir.print(" %%%d = %s %s %s, %s\n",
               node->getLocalIndex(), opcode, type0, name0, name1);
      ir.print(" br i1 %%%d, label %%block_%d, label %%block_%d\n",
               node->getLocalIndex(),
               node->getBranchDestination()->getNode()->getBlock()->getNumber(),
               _gpuCurrentBlock->getNextBlock()->getNumber());
      }
   else if (node->getOpCodeValue() == TR::Goto)
      {
      ir.print(" %s label %%block_%d\n",
               getOpCodeName(node->getOpCodeValue()),
               node->getBranchDestination()->getNode()->getBlock()->getNumber());
      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (node->getOpCodeValue() == TR::lookup)
      {
      getNodeName(node->getChild(0), name0, self()->comp());

      // Child 1 is the default destination; remaining children carry the
      // case constants.
      ir.print(" %s %s %s, label %%block_%d [ ",
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getChild(0)->getDataType()),
               name0,
               node->getChild(1)->getBranchDestination()->getNode()->getBlock()->getNumber()
               );
      for(int i=2; i < node->getNumChildren(); ++i)
         {
         ir.print("%s %d, label %%block_%d ",
                  getTypeName(node->getChild(0)->getDataType()),
                  node->getChild(i)->getCaseConstant(),
                  node->getChild(i)->getBranchDestination()->getNode()->getBlock()->getNumber());
         }
      ir.print("]\n");
      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (node->getOpCodeValue() == TR::table)
      {
      getNodeName(node->getChild(0), name0, self()->comp());

      // Table switch: case values are the dense indices 0..n-1.
      ir.print(" %s %s %s, label %%block_%d [ ",
               getOpCodeName(node->getOpCodeValue()),
               getTypeName(node->getChild(0)->getDataType()),
               name0,
               node->getChild(1)->getBranchDestination()->getNode()->getBlock()->getNumber()
               );
      for(int i=2; i < node->getNumChildren(); ++i)
         {
         ir.print("%s %d, label %%block_%d ",
                  getTypeName(node->getChild(0)->getDataType()),
                  i-2,
                  node->getChild(i)->getBranchDestination()->getNode()->getBlock()->getNumber());
         }
      ir.print("]\n");
      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (node->getOpCode().isBooleanCompare()) //Needs to be after "isIf()" check
      {
      getNodeName(node->getChild(0), name0, self()->comp());
      getNodeName(node->getChild(1), name1, self()->comp());
      const char *type0 = getTypeName(node->getChild(0)->getDataType());

      const char *opcode = getOpCodeName(node->getOpCodeValue());

      ir.print(" %%%d = %s %s %s, %s\n",
               node->getLocalIndex(), opcode, type0, name0, name1);
      node->setLocalIndex(_gpuNodeCount++);

      // Widen the i1 comparison result to the i32 the IL expects.
      ir.print(" %%%d = zext i1 %%%d to i32\n",
               node->getLocalIndex(),
               node->getLocalIndex()-1);
      }
   else if (node->getOpCodeValue() == TR::treetop || node->getOpCodeValue() == TR::Case)
      {
      node->setLocalIndex(_gpuNodeCount--);
      }
   else if (getOpCodeName(node->getOpCodeValue()) &&
            strcmp(getOpCodeName(node->getOpCodeValue()), "INVALID") == 0)
      {
      node->setLocalIndex(_gpuNodeCount--);
      traceMsg(self()->comp(), "INVALID operation required by node %p\n", node);
      return GPUInvalidProgram;
      }
   else
      {
      node->setLocalIndex(_gpuNodeCount--);
      traceMsg(self()->comp(), "node %p assigned index %d\n", node, node->getLocalIndex());
      traceMsg(self()->comp(), "unsupported opcode (%s) on line %d %p\n", node->getOpCode().getName(), self()->comp()->getLineNumber(node), node);
      return GPUInvalidProgram;
      }

   return GPUSuccess;
   }
2029
2030
2031
/**
 * \brief Dumps an NVVM IR buffer to the compilation trace log, one numbered
 *        line at a time.
 *
 * Lines longer than 255 characters are truncated to fit the staging buffer.
 *
 * \param comp   current compilation (trace log target)
 * \param buffer NUL-terminated NVVM IR text
 */
void traceNVVMIR(TR::Compilation *comp, char *buffer)
   {
   traceMsg(comp, "NVVM IR:\n");

   char chunk[256];
   int lineNum = 1;
   for (char *start = buffer; *start != '\0'; )
      {
      // Advance to one past the terminating '\n' (or one past the NUL).
      char *end = start;
      while (*end != '\n' && *end != '\0')
         ++end;
      ++end;

      // Copy at most 255 characters of the line into the staging buffer.
      int copyLen = (end - start) < 255 ? (int)(end - start) : 255;
      memcpy(chunk, start, copyLen);
      chunk[copyLen] = '\0';
      traceMsg(comp, "%6d: %s", lineNum++, chunk);

      // If we overshot onto the NUL, step back so the loop terminates there.
      if (end[-1] == '\0')
         --end;
      start = end;
      }

   traceMsg(comp, "\n");
   }
2057
2058
2059
void
2060
J9::CodeGenerator::findExtraParms(
2061
TR::Node *node,
2062
int32_t &numExtraParms,
2063
TR_SharedMemoryAnnotations *sharedMemory,
2064
vcount_t visitCount)
2065
{
2066
if (node->getVisitCount() == visitCount)
2067
return;
2068
2069
node->setVisitCount(visitCount);
2070
2071
if (node->getOpCode().isLoadIndirect() &&
2072
_gpuSymbolMap[node->getSymbolReference()->getReferenceNumber()]._parmSlot == -1)
2073
{
2074
TR::Node *firstChild = node->getChild(0);
2075
if (node->getSymbolReference()->getCPIndex() != -1) // field of some object
2076
{
2077
// TODO: check that field is an array!
2078
TR_ASSERT(firstChild->getOpCode().isLoadDirect() &&
2079
isThisPointer(firstChild->getSymbolReference()),
2080
"can only access a field of this object\n");
2081
2082
// TODO: handle duplicate names from different classes
2083
TR_SharedMemoryField field = sharedMemory->find(TR::comp(), node->getSymbolReference());
2084
2085
if (field.getSize() == 0)
2086
numExtraParms++;
2087
}
2088
}
2089
2090
for (int32_t i = 0; i < node->getNumChildren(); ++i)
2091
{
2092
TR::Node *child = node->getChild(i);
2093
self()->findExtraParms(child, numExtraParms, sharedMemory, visitCount);
2094
}
2095
}
2096
2097
2098
/**
 * Intended to emit an @llvm.invariant.end call for every address-typed GPU
 * parameter, closing the object-header invariant region opened with
 * @llvm.invariant.start in dumpNVVMIR().
 *
 * NOTE: this function is currently DISABLED — the unconditional `return`
 * below short-circuits the entire body, so callers get a no-op. The dead
 * code is kept for reference / potential re-enablement.
 */
void
J9::CodeGenerator::dumpInvariant(
      CS2::ArrayOf<gpuParameter, TR::Allocator>::Cursor pit,
      NVVMIRBuffer &ir,
      bool isbufferalign)
   {
   return;

   // --- dead code below (see NOTE above) ---
   for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())
      {
      TR::Symbol *sym = pit->_hostSymRef->getSymbol();
      char parmName[MAX_NAME+2];
      getParmName(pit->_parmSlot, parmName, false);
      if (sym->getDataType() == TR::Address)
         {
         // When buffer alignment is enabled the adjusted (".t") pointer was the
         // one passed to llvm.invariant.start, so mirror that name here.
         if (isbufferalign)
            strcat(parmName, ".t");
         ir.print("  call void @llvm.invariant.end({}* %%inv_%s_header, i64 %d, i8* %s)\n",
                  &parmName[1], self()->objectHeaderInvariant(), parmName);
         }
      }
   }
2120
2121
#ifdef ENABLE_GPU
2122
bool calculateComputeCapability(int tracing, short* computeMajor, short* computeMinor, int deviceId);
2123
bool getNvvmVersion(int tracing, int* majorVersion, int* minorVersion);
2124
#endif
2125
2126
/**
 * \brief Generate an NVVM IR module for the given range of trees.
 *
 * Walks the trees twice: a first pass collects branch-target block numbers
 * and the kernel's return type; a second pass emits IR for every tree via
 * printNVVMIR(). The emitted module contains the kernel function
 * (\@test<gpuPtxCount>), the intrinsic/libdevice declarations it needs, and
 * the nvvm.annotations metadata that marks the function as a CUDA kernel.
 *
 * \param firstTreeTop       first tree to translate (inclusive)
 * \param lastTreeTop        tree at which to stop (exclusive)
 * \param loop               enclosing loop region, forwarded to printNVVMIR()
 * \param blocksInLoop       if non-NULL, only trees in these blocks are translated
 * \param autos              automatics of the method; each gets an alloca slot
 * \param parms              parameters of the method; each becomes a kernel argument
 * \param staticMethod       true if the method has no receiver; when false the
 *                           first entry of \p parms (the receiver) is skipped
 * \param nvvmIR             [out] the generated NVVM IR text
 * \param errorNode          [out] node that caused a failure (set by printNVVMIR())
 * \param gpuPtxCount        numeric suffix used to name the kernel (\@test<N>)
 * \param hasExceptionChecks [out, optional] set to true when the kernel contains
 *                           null/bound/div checks and so may raise a Java
 *                           exception; may be NULL (generateGPU() passes 0)
 * \return GPUSuccess on success, or a GPU failure code
 */
TR::CodeGenerator::GPUResult
J9::CodeGenerator::dumpNVVMIR(
      TR::TreeTop *firstTreeTop,
      TR::TreeTop *lastTreeTop,
      TR_RegionStructure *loop,
      SharedSparseBitVector *blocksInLoop,
      ListBase<TR::AutomaticSymbol> *autos,
      ListBase<TR::ParameterSymbol> *parms,
      bool staticMethod,
      char * &nvvmIR,
      TR::Node * &errorNode,
      int gpuPtxCount,
      bool* hasExceptionChecks)
   {
   static bool isbufferalign = feGetEnv("TR_disableGPUBufferAlign") ? false : true;
   NVVMIRBuffer ir(self()->comp()->trMemory());
   GPUResult result;
   short computeMajor, computeMinor, computeCapability;
   int nvvmMajorVersion = 0;
   int nvvmMinorVersion = 0;

   // Reset per-kernel code generation state.
   _gpuHasNullCheck = false;
   _gpuHasBndCheck = false;
   _gpuHasDivCheck = false;
   _gpuNodeCount = 0;
   _gpuReturnType = TR::NoType;
   _gpuPostDominators = NULL;
   _gpuStartBlock = NULL;
   _gpuNeedNullCheckArguments_vector = 0;
   _gpuCanUseReadOnlyCache = false;
   _gpuUseOldLdgCalls = false;

#ifdef ENABLE_GPU
   // Query the device and NVVM library to decide which ldg flavor to emit.
   if (!calculateComputeCapability(/*tracing*/0, &computeMajor, &computeMinor, /*deviceId*/0))
      {
      traceMsg(self()->comp(), "calculateComputeCapability was unsuccessful.\n");
      return GPUHelperError;
      }
   computeCapability = 100*computeMajor + computeMinor; //combines Major and Minor versions into a single number.

   if (computeCapability >= 305) //If compute capability is 3.5 or higher
      _gpuCanUseReadOnlyCache = true; //then the GPU is capable of using read only cache

   if (!getNvvmVersion(/*tracing*/0, &nvvmMajorVersion, &nvvmMinorVersion))
      {
      traceMsg(self()->comp(), "getNvvmVersion was unsuccessful.\n");
      return GPUHelperError;
      }

   /*
    * NVVM 1.3 updates LLVM support to LLVM 3.8. From LLVM 3.6 onward, ldg was changed to make alignment an explicit
    * parameter instead of as metadata. As a result, NVVM 1.2 and before uses a one parameter version of ldg while
    * NVVM 1.3 and onward uses a two parameter version.
    */
   if (nvvmMajorVersion == 1 && nvvmMinorVersion <= 2)
      {
      _gpuUseOldLdgCalls = true;
      }
#endif

   TR::CFG *cfg = self()->comp()->getFlowGraph();
   TR_BitVector targetBlocks(cfg->getNumberOfNodes(), self()->comp()->trMemory(), stackAlloc, growable);

   static bool enableExceptionCheckElimination = (feGetEnv("TR_enableGPUExceptionCheckElimination") != NULL);
   if (enableExceptionCheckElimination)
      {
      _gpuPostDominators = new (self()->comp()->trStackMemory()) TR_Dominators(self()->comp(), true);
      }
   _gpuStartBlock = toBlock(cfg->getStart());

   TR_SharedMemoryAnnotations sharedMemory(self()->comp());
   int32_t numExtraParms = 0;

   // First pass through the trees: record every branch/switch target block
   // (so printNVVMIR can emit labels) and capture the kernel's return type.
   vcount_t visitCount = self()->comp()->incVisitCount();

   int32_t currentBlock = 0;
   int32_t firstBlock = 0;
   for (TR::TreeTop * tree = firstTreeTop; tree != lastTreeTop; tree = tree->getNextTreeTop())
      {
      if (tree->getNode()->getOpCodeValue() == TR::BBStart)
         currentBlock = tree->getNode()->getBlock()->getNumber();

      if (blocksInLoop && !blocksInLoop->ValueAt(currentBlock))
         continue;

      if (firstBlock == 0)
         firstBlock = currentBlock;

      TR::Node *node = tree->getNode();
      if (node->getOpCode().isBranch())
         {
         TR_ASSERT(node->getBranchDestination()->getNode()->getOpCodeValue() == TR::BBStart, "Attempted to get Block number of a non-BBStart node.");
         targetBlocks.set(node->getBranchDestination()->getNode()->getBlock()->getNumber());

         // The fall-through successor (two treetops ahead: BBEnd then BBStart)
         // also needs a label.
         if (tree->getNextTreeTop() &&
             tree->getNextTreeTop()->getNextTreeTop())
            {
            node = tree->getNextTreeTop()->getNextTreeTop()->getNode();
            TR_ASSERT(node->getOpCodeValue() == TR::BBStart, "Attempted to get Block number of a non-BBStart node.");
            targetBlocks.set(node->getBlock()->getNumber());
            }
         }
      else if (node->getOpCode().isSwitch())
         {
         for (int childIndex = 0; childIndex < node->getNumChildren(); ++childIndex)
            {
            if (node->getChild(childIndex)->getOpCode().isBranch())
               {
               TR_ASSERT(node->getChild(childIndex)->getBranchDestination()->getNode()->getOpCodeValue() == TR::BBStart, "Attempted to get Block number of a non-BBStart node.");
               targetBlocks.set(node->getChild(childIndex)->getBranchDestination()->getNode()->getBlock()->getNumber());
               }
            }
         }
      else if (node->getOpCode().isReturn())
         _gpuReturnType = node->getDataType().getDataType();

      //findExtraParms(node, numExtraParms, &sharedMemory, visitCount);
      }

   traceMsg(self()->comp(), "extra parameters = %d\n", numExtraParms);
   // Module header: target triple and data layout for 64-bit nvptx.
   ir.print("target triple = \"nvptx64-unknown-cuda\"\n");
   ir.print("target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"\n\n"); // TODO: 32-bit

   // Emit shared-memory (addrspace(3)) globals for annotated fields.
   // TODO: alignment, arraylength !!!
   for(auto lit = sharedMemory.getSharedMemoryFields().begin(); lit != sharedMemory.getSharedMemoryFields().end(); ++lit)
      {
      if ((*lit).getSize() > 0)
         ir.print("@%.*s = internal addrspace(3) global [%d x %s] zeroinitializer, align 8\n",
                  (*lit).getFieldNameLength(), (*lit).getFieldName(), (*lit).getSize(),
                  getTypeNameFromSignature((*lit).getFieldSig(), (*lit).getFieldSigLength()));
      }

   static bool disableReadOnlyCacheArray = (feGetEnv("TR_disableGPUReadOnlyCacheArray") != NULL);
   static bool disableReadOnlyCacheObjHdr = (feGetEnv("TR_disableGPUReadOnlyCacheObjHdr") != NULL);

   //ir.print("@_ExceptionKind = addrspace(1) global [1 x i32 0, align 4\n");
   ir.print("@_ExceptionKind = addrspace(1) global [1 x i32] zeroinitializer, align 4\n");

   //NVVM 1.3 onward uses a two parameter version of ldg
   if (_gpuCanUseReadOnlyCache && (!disableReadOnlyCacheArray || !disableReadOnlyCacheObjHdr))
      {
      if (_gpuUseOldLdgCalls)
         {
         ir.print("declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %%ptr)\n");
         ir.print("declare i16 @llvm.nvvm.ldg.global.i.i16.p1i16(i16 addrspace(1)* %%ptr)\n");
         ir.print("declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%ptr)\n");
         ir.print("declare i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* %%ptr)\n");
         ir.print("declare float @llvm.nvvm.ldg.global.f.f32.p1f32(float addrspace(1)* %%ptr)\n");
         ir.print("declare double @llvm.nvvm.ldg.global.f.f64.p1f64(double addrspace(1)* %%ptr)\n");
         ir.print("declare i8* @llvm.nvvm.ldg.global.p.p64.p1p64(i8* addrspace(1)* %%ptr)\n");
         }
      else
         {
         ir.print("declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %%ptr, i32 %%align)\n");
         ir.print("declare i16 @llvm.nvvm.ldg.global.i.i16.p1i16(i16 addrspace(1)* %%ptr, i32 %%align)\n");
         ir.print("declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %%ptr, i32 %%align)\n");
         ir.print("declare i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* %%ptr, i32 %%align)\n");
         ir.print("declare float @llvm.nvvm.ldg.global.f.f32.p1f32(float addrspace(1)* %%ptr, i32 %%align)\n");
         ir.print("declare double @llvm.nvvm.ldg.global.f.f64.p1f64(double addrspace(1)* %%ptr, i32 %%align)\n");
         ir.print("declare i8* @llvm.nvvm.ldg.global.p.p64.p1p64(i8* addrspace(1)* %%ptr, i32 %%align)\n");
         }
      }

   ir.print("declare {}* @llvm.invariant.start(i64 %%size, i8* nocapture %%ptr)\n");
   ir.print("declare void @llvm.invariant.end({}* %%start, i64 %%size, i8* nocapture %%ptr)\n");

   // Kernel signature: Java parameters, GPU-mapped symbols, then the implicit
   // range bounds (non-GPU compilation only) and the ExceptionKind pointer.
   ir.print("\ndefine %s @test%d(", getTypeName(_gpuReturnType), gpuPtxCount);

   CS2::ArrayOf<gpuParameter, TR::Allocator> gpuParameterMap(TR::comp()->allocator());
   CS2::ArrayOf<TR::CodeGenerator::gpuMapElement, TR::Allocator>::Cursor ait(_gpuSymbolMap);

   for (ait.SetToFirst(); ait.Valid(); ait.SetToNext())
      {
      if (!ait->_hostSymRef) continue;
      traceMsg(TR::comp(), "hostSymRef #%d parmSlot %d\n", (int)ait, ait->_parmSlot);

      if (ait->_parmSlot != -1)
         {
         gpuParameter parm (ait->_hostSymRef, ait->_parmSlot);
         gpuParameterMap[ait->_parmSlot] = parm;
         }
      }

   TR::ResolvedMethodSymbol *method = self()->comp()->getJittedMethodSymbol();
   ListIterator<TR::ParameterSymbol> pi(parms);
   TR::ParameterSymbol *parm;

   bool first = true;
   int32_t nextParmNum = staticMethod ? 0 : 1;

   // Skip the receiver for instance methods.
   parm = pi.getFirst();
   if (!staticMethod) parm = pi.getNext();

   char name[MAX_NAME];

   for (; parm; parm = pi.getNext())
      {
      getAutoOrParmName(parm, name, false);

      if (!first) ir.print(", ");
      ir.print("%s %s", getTypeName(parm->getDataType()), name);
      first = false;
      nextParmNum++;
      }

   CS2::ArrayOf<gpuParameter, TR::Allocator>::Cursor pit(gpuParameterMap);
   for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())
      {
      getParmName(pit->_parmSlot, name, false);

      if (!first) ir.print(", ");
      ir.print("%s %s", getTypeName(pit->_hostSymRef->getSymbol()->getDataType()), name);
      first = false;
      nextParmNum++;
      }

   int numParms = nextParmNum - (staticMethod ? 0 : 1);

   ir.print("%s%s%s %%ExceptionKind",
            numParms > 0 ? ", " : "",
            self()->comp()->isGPUCompilation() ? "" : "i32 %startInclusive, i32 %endExclusive, ",
            getTypeName(TR::Address));

   ir.print(") {\n");
   ir.print("entry:\n");

   // Entry block: spill every kernel argument into a stack slot so the body
   // can treat parameters and locals uniformly through loads and stores.
   pi.reset();
   parm = pi.getFirst();
   if (!staticMethod) parm = pi.getNext();

   first = true;
   for (; parm; parm = pi.getNext())
      {
      char name[MAX_NAME];
      getAutoOrParmName(parm, name);
      ir.print(" %s = alloca %s, align %d\n",
               name,
               getTypeName(parm->getDataType()),
               parm->getSize());

      char origName[MAX_NAME];
      getAutoOrParmName(parm, origName, false);
      ir.print(" store %s %s, %s* %s, align %d\n",
               getTypeName(parm->getDataType()),
               origName,
               getTypeName(parm->getDataType()),
               name,
               parm->getSize());
      }

   for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())
      {
      TR::Symbol *sym = pit->_hostSymRef->getSymbol();
      char addrName[MAX_NAME+2];
      getParmName(pit->_parmSlot, addrName);
      ir.print(" %s = alloca %s, align %d\n",
               addrName,
               getTypeName(sym->getDataType()),
               sym->getSize());

      char parmName[MAX_NAME];
      getParmName(pit->_parmSlot, parmName, false);
      if (sym->getDataType() == TR::Address)
         {
         if (isbufferalign)
            {
            // Rebase the incoming buffer pointer so that the array body (after
            // the object header) is GPUAlignment-aligned; the ".t" name is the
            // adjusted pointer.
            char name[MAX_NAME];
            strcpy(name, parmName);
            strcat(parmName, ".t");
            ir.print(" %s = getelementptr inbounds i8* %s, i32 %d\n",
                     parmName,
                     name,
                     GPUAlignment - TR::Compiler->om.contiguousArrayHeaderSizeInBytes());
            }
         // Mark the object header as invariant for the kernel's lifetime.
         ir.print(" %%inv_%s_header = call {}* @llvm.invariant.start(i64 %d, i8* %s)\n",
                  &parmName[1], self()->objectHeaderInvariant(), parmName);
         }
      ir.print(" store %s %s, %s* %s, align %d\n",
               getTypeName(sym->getDataType()),
               parmName,
               getTypeName(sym->getDataType()),
               addrName,
               sym->getSize());

      }

   // Stack slots for the method's automatics (%a<N>.addr).
   ListIterator<TR::AutomaticSymbol> ai(autos);
   uint16_t liveLocalIndex = 0;
   for (TR::AutomaticSymbol *a = ai.getFirst(); a != NULL; a = ai.getNext())
      {
      ir.print(" %%a%d.addr = alloca %s, align %d\n",
               liveLocalIndex,
               getTypeName(a->getDataType()),
               a->getSize());
      a->setLiveLocalIndex(liveLocalIndex++, 0);
      }

   // For a loop extracted from CPU code, compute the induction value from the
   // thread/block id, store it into local a0, and bail out to block_0 (which
   // just returns) when it is outside [startInclusive, endExclusive).
   if (!self()->comp()->isGPUCompilation())
      {
      ir.print(" %%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n");
      ir.print(" %%1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n");
      ir.print(" %%2 = mul i32 %%0, %%1\n");
      ir.print(" %%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n");
      ir.print(" %%4 = add i32 %%2, %%3\n");
      ir.print(" %%5 = add i32 %%startInclusive, %%4\n");
      ir.print(" store i32 %%5, i32* %%%s.addr, align 4\n", "a0");
      ir.print(" %%6 = icmp slt i32 %%5, %%endExclusive\n");
      ir.print(" br i1 %%6, label %%block_%d, label %%block_0\n", firstBlock);
      ir.print("block_0:\n");
      self()->dumpInvariant(pit, ir, isbufferalign);
      ir.print(" ret void\n");

      _gpuNodeCount = 7;
      }

   // print all trees (second pass)
   visitCount = self()->comp()->incVisitCount();

   for (TR::TreeTop * tree = firstTreeTop; tree != lastTreeTop; tree = tree->getNextTreeTop())
      {
      TR::Node *node = tree->getNode();

      if (node->getOpCodeValue() == TR::BBStart)
         currentBlock = node->getBlock()->getNumber();

      if (blocksInLoop && !blocksInLoop->ValueAt(currentBlock))
         continue;

      // don't print the backedge; a branch back to the first block means the
      // kernel iteration is done, so return instead.
      if (node->getOpCode().isBranch() &&
          node->getBranchDestination()->getNode()->getBlock()->getNumber() == firstBlock)
         {
         self()->dumpInvariant(pit, ir, isbufferalign);
         ir.print(" ret void\n");
         continue;
         }

      result = self()->printNVVMIR(ir, tree->getNode(), loop, &targetBlocks, visitCount, &sharedMemory, nextParmNum, errorNode);
      if (result != GPUSuccess)
         {
         return result;
         }
      }

   if (_gpuReturnType == TR::NoType)
      {
      self()->dumpInvariant(pit, ir, isbufferalign);
      ir.print(" ret void\n");
      }

   _gpuNodeCount++;

   // Record (as an IR comment) which arguments still need host-side null checks.
   if (_gpuNeedNullCheckArguments_vector != 0)
      {
      ir.print("; needNullCheckArguments_vector=");
      int32_t len = sizeof(uint64_t) * CHAR_BIT;
      for (int32_t i = len - 1; i >= 0; i--)
         {
         ir.print("%u", (_gpuNeedNullCheckArguments_vector >> (uint64_t)i) & 1);
         }
      ir.print("\n");
      }

   // Exception-exit blocks: each stores its exception kind through the
   // ExceptionKind pointer and returns, so the host can rethrow in Java.
   if (_gpuHasNullCheck)
      {
      ir.print("NullException:\n");
      //ir.print(" store i32 1, i32 addrspace(1)* @_ExceptionKind, align 4\n");
      ir.print(" %%%d = bitcast i8* %%ExceptionKind to i32*\n", _gpuNodeCount);
      ir.print(" store i32 %d, i32 * %%%d, align 4\n", GPUNullCheck, _gpuNodeCount++);
      self()->dumpInvariant(pit, ir, isbufferalign);
      ir.print(" ret void\n");
      }
   if (_gpuHasBndCheck)
      {
      ir.print("BndException:\n");
      //ir.print(" store i32 2, i32 addrspace(1)* @_ExceptionKind, align 4\n");
      ir.print(" %%%d = bitcast i8* %%ExceptionKind to i32*\n", _gpuNodeCount);
      ir.print(" store i32 %d, i32 * %%%d, align 4\n", GPUBndCheck, _gpuNodeCount++);
      self()->dumpInvariant(pit, ir, isbufferalign);
      ir.print(" ret void\n");
      }
   if (_gpuHasDivCheck)
      {
      ir.print("DivException:\n");
      //ir.print(" store i32 3, i32 addrspace(1)* @_ExceptionKind, align 4\n");
      ir.print(" %%%d = bitcast i8* %%ExceptionKind to i32*\n", _gpuNodeCount);
      ir.print(" store i32 %d, i32 * %%%d, align 4\n", GPUDivException, _gpuNodeCount++);
      self()->dumpInvariant(pit, ir, isbufferalign);
      ir.print(" ret void\n");
      }

   ir.print("}\n");

   // Declarations of every sreg intrinsic the kernel may reference.
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone\n");
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone\n");
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone\n");

   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone\n");
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() nounwind readnone\n");
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z() nounwind readnone\n");

   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone\n");
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() nounwind readnone\n");
   ir.print("declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() nounwind readnone\n");

   // libdevice math entry points, only when GPU math is enabled.
   if (self()->comp()->getOptions()->getEnableGPU(TR_EnableGPUEnableMath))
      {
      ir.print("declare double @__nv_sin(double)\n");
      ir.print("declare double @__nv_cos(double)\n");
      ir.print("declare double @__nv_sqrt(double)\n");
      ir.print("declare double @__nv_log(double)\n");
      ir.print("declare double @__nv_exp(double)\n");
      ir.print("declare double @__nv_fabs(double)\n");
      ir.print("declare float @__nv_fabsf(float)\n");
      }

   ir.print("declare void @llvm.nvvm.barrier0() nounwind readnone\n");

   ir.print("!10 = metadata !{i32 0}\n");
   ir.print("!11 = metadata !{i32 1}\n");
   ir.print("!12 = metadata !{i32 2}\n");
   ir.print("!14 = metadata !{i32 4}\n");
   ir.print("!18 = metadata !{i32 8}\n");

   ir.print("!nvvmir.version = !{!0}\n");
   ir.print("!0 = metadata !{i32 1, i32 0}\n");

   // nvvm.annotations marks @test<N> as a kernel; the metadata repeats the
   // full signature, so walk the parameter lists again in the same order.
   ir.print("!nvvm.annotations = !{!1}\n");
   ir.print("!1 = metadata !{%s (", getTypeName(_gpuReturnType));
   pi.reset();
   parm = pi.getFirst();
   if (!staticMethod) parm = pi.getNext();

   first = true;
   for (; parm; parm = pi.getNext())
      {
      if (!first) ir.print(", ");
      ir.print("%s", getTypeName(parm->getDataType()));
      first = false;
      }

   for (pit.SetToFirst(); pit.Valid(); pit.SetToNext())
      {
      TR::Symbol *sym = pit->_hostSymRef->getSymbol();
      if (!first) ir.print(", ");
      ir.print("%s", getTypeName(sym->getDataType()));
      first = false;
      }

   ir.print("%s%s%s",
            numParms > 0 ? ", " : "",
            self()->comp()->isGPUCompilation() ? "" : "i32, i32, ", // startInclusive, endExclusive
            getTypeName(TR::Address)); // for ExceptionCheck

   ir.print(")* @test%d, metadata !\"kernel\", i32 1}\n", gpuPtxCount);

   nvvmIR = ir.getString();

   traceNVVMIR(self()->comp(), nvvmIR);

   //if any of these are set, it means this kernel may trigger a Java exception
   // BUGFIX: guard the out-parameter — generateGPU() passes 0 (NULL) for
   // hasExceptionChecks, which previously caused a NULL dereference here.
   if (hasExceptionChecks)
      *hasExceptionChecks = (_gpuHasNullCheck || _gpuHasBndCheck || _gpuHasDivCheck);

   return GPUSuccess;
   }
2599
2600
2601
/**
 * Driver for whole-method GPU compilation. When the compilation is marked as
 * a GPU compilation, this:
 *   1. generates NVVM IR for the entire method via dumpNVVMIR() and stores
 *      the result (and, on success, the IR text) in the optimization plan;
 *   2. unless CPU code is also being compiled (isGPUCompileCPUCode() false,
 *      early return), rewrites the method body into a single call to the
 *      TR_callGPU runtime helper: the original parameter list is replaced by
 *      the helper's 13 parameters, and a treetop + ireturn of the helper call
 *      are inserted after the start tree.
 */
void
J9::CodeGenerator::generateGPU()
   {
   if (self()->comp()->isGPUCompilation())
      {
      char *programSource;
      TR::Node *errorNode;
      GPUResult result;
      TR::ResolvedMethodSymbol *method = self()->comp()->getJittedMethodSymbol();

      {
      TR::StackMemoryRegion stackMemoryRegion(*self()->trMemory());

      // NOTE(review): the final argument (hasExceptionChecks, a bool*) is 0 —
      // dumpNVVMIR must tolerate a NULL out-parameter here; confirm.
      result = self()->dumpNVVMIR(self()->comp()->getStartTree(), self()->comp()->findLastTree(),
                                  NULL,
                                  NULL,
                                  &method->getAutomaticList(),
                                  &method->getParameterList(),
                                  false, // TODO: check if method is static
                                  programSource, errorNode, 0, 0); //gpuPtxCount is not applicable here so it is always set to 0.

      } // scope of the stack memory region

      self()->comp()->getOptimizationPlan()->setGPUResult(result);

      if (result == GPUSuccess)
         {
         self()->comp()->getOptimizationPlan()->setGPUIR(programSource);
         }

      // When only the GPU side is wanted, leave the method's IL untouched.
      if (!self()->comp()->isGPUCompileCPUCode())
         return;

      // Split the start block after the first tree so the helper call and
      // return inserted below form the method's new body.
      TR::CFG *cfg = self()->comp()->getFlowGraph();
      TR::Block *startBlock = self()->comp()->getStartBlock();
      startBlock->split(self()->comp()->getStartTree()->getNextTreeTop(), cfg, false, false);

      // Replace the method's parameter list with the 13 parameters of the
      // TR_callGPU helper. Slot/linkage assignment below mirrors the child
      // order of the call node built afterwards:
      //   slots 0-3 (Address): vmThread, method, programSource, invokeObject
      //   slots 4-7 (Int32):   deviceId, gridDimX, gridDimY, gridDimZ
      //   slots 8-11 (Int32):  blockDimX, blockDimY, blockDimZ, argCount
      //   slot 12 (Address):   args
      ListAppender<TR::ParameterSymbol> la(&method->getParameterList());
      la.empty(); // empty current parameter list

      TR::ParameterSymbol *parmSymbol;
      int slot = 0;

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(0);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(1);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(2);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(3);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      // NOTE(review): unlike every sibling, this slot (deviceId) does not call
      // setReferencedParameter() — looks like an oversight; confirm.
      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setLinkageRegisterIndex(4);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(5);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(6);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setLinkageRegisterIndex(7);
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      // Remaining parameters carry no linkage register index.
      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);


      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Int32);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      parmSymbol = method->comp()->getSymRefTab()->createParameterSymbol(method, slot, TR::Address);
      parmSymbol->setOrdinal(slot++);
      parmSymbol->setReferencedParameter();
      parmSymbol->setTypeSignature("", 0);
      la.add(parmSymbol);

      // Build the icall to TR_callGPU; each child loads the auto in the
      // matching slot created above.
      TR::Node *callNode, *parm;
      TR::SymbolReference *parmSymRef;
      callNode = TR::Node::create(self()->comp()->getStartTree()->getNode(), TR::icall, 13);

      parm = TR::Node::create(callNode, TR::aload, 0); // vmThread
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 0, TR::Address);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(0, parm);

      parm = TR::Node::create(callNode, TR::aload, 0); // method
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 1, TR::Address);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(1, parm);

      parm = TR::Node::create(callNode, TR::aload, 0); // programSource
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 2, TR::Address);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(2, parm);

      parm = TR::Node::create(callNode, TR::aload, 0); // invokeObject
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 3, TR::Address);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(3, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // deviceId
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 4, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(4, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // gridDimX
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 5, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(5, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // gridDimY
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 6, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(6, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // gridDimZ
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 7, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(7, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // blockDimX
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 8, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(8, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // blockDimY
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 9, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(9, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // blockDimZ
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 10, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(10, parm);

      parm = TR::Node::create(callNode, TR::iload, 0); // argCount
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 11, TR::Int32);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(11, parm);

      parm = TR::Node::create(callNode, TR::aload, 0); // args
      parmSymRef = self()->comp()->getSymRefTab()->findOrCreateAutoSymbol(method, 12, TR::Address);
      parm->setSymbolReference(parmSymRef);
      callNode->setAndIncChild(12, parm);

      // Anchor the call under a treetop and return its result.
      TR::SymbolReference *helper = self()->comp()->getSymRefTab()->findOrCreateRuntimeHelper(TR_callGPU);
      helper->getSymbol()->castToMethodSymbol()->setLinkage(TR_System);
      callNode->setSymbolReference(helper);
      TR::Node *treetop = TR::Node::create(callNode, TR::treetop, 1);
      treetop->setAndIncChild(0, callNode);
      TR::TreeTop *callTreeTop = TR::TreeTop::create(self()->comp(), treetop);
      self()->comp()->getStartTree()->insertAfter(callTreeTop);

      TR::Node *returnNode = TR::Node::create(callNode, TR::ireturn, 1); // TODO: handle mismatching returns
      returnNode->setAndIncChild(0, callNode);
      TR::TreeTop *returnTreeTop = TR::TreeTop::create(self()->comp(), returnNode);
      callTreeTop->insertAfter(returnTreeTop);

      }
   }
2814
2815
uintptr_t
2816
J9::CodeGenerator::objectLengthOffset()
2817
{
2818
return self()->fe()->getOffsetOfContiguousArraySizeField();
2819
}
2820
2821
uintptr_t
2822
J9::CodeGenerator::objectHeaderInvariant()
2823
{
2824
return self()->objectLengthOffset() + 4 /*length*/ ;
2825
}
2826
2827