Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openj9
Path: blob/master/runtime/compiler/optimizer/DataAccessAccelerator.hpp
6000 views
1
/*******************************************************************************
2
* Copyright (c) 2000, 2020 IBM Corp. and others
3
*
4
* This program and the accompanying materials are made available under
5
* the terms of the Eclipse Public License 2.0 which accompanies this
6
* distribution and is available at https://www.eclipse.org/legal/epl-2.0/
7
* or the Apache License, Version 2.0 which accompanies this distribution and
8
* is available at https://www.apache.org/licenses/LICENSE-2.0.
9
*
10
* This Source Code may also be made available under the following
11
* Secondary Licenses when the conditions for such availability set
12
* forth in the Eclipse Public License, v. 2.0 are satisfied: GNU
13
* General Public License, version 2 with the GNU Classpath
14
* Exception [1] and GNU General Public License, version 2 with the
15
* OpenJDK Assembly Exception [2].
16
*
17
* [1] https://www.gnu.org/software/classpath/license.html
18
* [2] http://openjdk.java.net/legal/assembly-exception.html
19
*
20
* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception
21
*******************************************************************************/
22
23
#ifndef DATAACCESSACCELERATOR_INCL
24
#define DATAACCESSACCELERATOR_INCL
25
26
#include <stddef.h>
27
#include <stdint.h>
28
#include <vector>
29
#include "compile/Compilation.hpp"
30
#include "env/TRMemory.hpp"
31
#include "il/Block.hpp"
32
#include "il/ILOpCodes.hpp"
33
#include "il/Node.hpp"
34
#include "infra/Array.hpp"
35
#include "infra/Assert.hpp"
36
#include "infra/BitVector.hpp"
37
#include "infra/ILWalk.hpp"
38
#include "infra/List.hpp"
39
#include "optimizer/Optimization.hpp"
40
#include "optimizer/OptimizationManager.hpp"
41
42
namespace TR { class TreeTop; }
43
44
/** \brief
45
*
46
* Transforms calls to recognized Data Access Accelerator (DAA) library methods into semantically
47
* equivalent hardware intrinsics for the underlying platform.
48
*
49
* \details
50
*
51
* The Data Access Accelerator (DAA) library (com/ibm/dataaccess) found in IBM J9 Virtual Machine is a utility
52
* library for performing hardware accelerated operations on Java data types. All library methods have a Java
53
* implementation, however if hardware support exists for a particular operation the JIT compiler will attempt
54
* to replace calls to such library methods with semantically equivalent hardware intrinsics.
55
*
56
* The DAA library method calls that are actually replaced with hardware intrinsics are not the publicly visible
57
* API methods. Instead we replace the private "underscore" counterparts to the public API so as to guard us against
58
* possible future modifications of such private methods. For example the
59
* com/ibm/dataaccess/ByteArrayMarshaller.writeInt(I[BIZ)V method calls a private
60
* com/ibm/dataaccess/ByteArrayMarshaller.writeInt_(I[BIZ)V method which carries out the write operation. The
61
* latter method is the so called "underscore" method which we recognize and hardware accelerate.
62
63
* Note that if hardware acceleration support is detected we prevent the inlining of such "underscore" methods
64
* in the inliner on the assumption that this optimization will reduce such calls to simple semantically
65
* equivalent trees which will outperform the inlined call.
66
*
67
* The DAA library is broken up into four main classes and hardware support is defined per method as follows:
68
*
69
* \section com.ibm.dataaccess.ByteArrayMarshaller
70
*
71
* The following methods have hardware acceleration support on x86 (Linux and Windows), PPC (Linux and AIX),
72
* and System z (Linux and z/OS):
73
*
74
* - com/ibm/dataaccess/ByteArrayMarshaller.writeShort_(S[BIZ)V
75
* - com/ibm/dataaccess/ByteArrayMarshaller.writeShort_(S[BIZI)V
76
* - com/ibm/dataaccess/ByteArrayMarshaller.writeInt_(I[BIZ)V
77
* - com/ibm/dataaccess/ByteArrayMarshaller.writeInt_(I[BIZI)V
78
* - com/ibm/dataaccess/ByteArrayMarshaller.writeLong_(J[BIZ)V
79
* - com/ibm/dataaccess/ByteArrayMarshaller.writeLong_(J[BIZI)V
80
* - com/ibm/dataaccess/ByteArrayMarshaller.writeFloat_(F[BIZ)V
81
* - com/ibm/dataaccess/ByteArrayMarshaller.writeDouble_(D[BIZ)V
82
*
83
* \section com.ibm.dataaccess.ByteArrayUnmarshaller
84
*
85
* The following methods have hardware acceleration support on x86 (Linux and Windows), PPC (Linux and AIX),
86
* and System z (Linux and z/OS):
87
*
88
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readShort_([BIZ)S
89
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readShort_([BIZIZ)S
90
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readInt_([BIZ)I
91
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readInt_([BIZIZ)I
92
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readLong_([BIZ)J
93
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readLong_([BIZIZ)J
94
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readFloat_([BIZ)F
95
* - com/ibm/dataaccess/ByteArrayUnmarshaller.readDouble_([BIZ)D
96
*
97
* \section com.ibm.dataaccess.DecimalData
98
*
99
* The following methods have hardware acceleration support on System z (Linux and z/OS):
100
*
101
* - com/ibm/dataaccess/DecimalData.convertPackedDecimalToUnicodeDecimal_([BI[CIII)V
102
* - com/ibm/dataaccess/DecimalData.convertUnicodeDecimalToPackedDecimal_([CI[BIII)V
103
* - com/ibm/dataaccess/DecimalData.convertPackedDecimalToExternalDecimal_([BI[BIII)V
104
* - com/ibm/dataaccess/DecimalData.convertExternalDecimalToPackedDecimal_([BI[BIII)V
105
*
106
* The following methods have hardware acceleration support on System z (z/OS):
107
*
108
* - com/ibm/dataaccess/DecimalData.convertPackedDecimalToInteger_([BIIZ)I
109
* - com/ibm/dataaccess/DecimalData.convertPackedDecimalToInteger_(Ljava/nio/ByteBuffer;IIZJII)I
110
* - com/ibm/dataaccess/DecimalData.convertIntegerToPackedDecimal_(I[BIIZ)V
111
* - com/ibm/dataaccess/DecimalData.convertIntegerToPackedDecimal_(ILjava/nio/ByteBuffer;IIZJII)V
112
* - com/ibm/dataaccess/DecimalData.convertPackedDecimalToLong_([BIIZ)J
113
* - com/ibm/dataaccess/DecimalData.convertPackedDecimalToLong_(Ljava/nio/ByteBuffer;IIZJII)J
114
* - com/ibm/dataaccess/DecimalData.convertLongToPackedDecimal_(J[BIIZ)V
115
* - com/ibm/dataaccess/DecimalData.convertLongToPackedDecimal_(JLjava/nio/ByteBuffer;IIZJII)V
116
*
117
* \section com.ibm.dataaccess.PackedDecimal
118
*
119
* The following methods have hardware acceleration support on System z (Linux and z/OS):
120
*
121
* - com/ibm/dataaccess/PackedDecimal.checkPackedDecimal_([BIIZZ)I
122
*
123
* The following methods have hardware acceleration support on System z (z/OS):
124
*
125
* - com/ibm/dataaccess/PackedDecimal.addPackedDecimal_([BII[BII[BIIZ)V
126
* - com/ibm/dataaccess/PackedDecimal.subtractPackedDecimal_([BII[BII[BIIZ)V
127
* - com/ibm/dataaccess/PackedDecimal.multiplyPackedDecimal_([BII[BII[BIIZ)V
128
* - com/ibm/dataaccess/PackedDecimal.dividePackedDecimal_([BII[BII[BIIZ)V
129
* - com/ibm/dataaccess/PackedDecimal.remainderPackedDecimal_([BII[BII[BIIZ)V
130
* - com/ibm/dataaccess/PackedDecimal.shiftLeftPackedDecimal_([BII[BIIIZ)V
131
* - com/ibm/dataaccess/PackedDecimal.shiftRightPackedDecimal_([BII[BIIIZ)V
132
* - com/ibm/dataaccess/PackedDecimal.lessThanPackedDecimal_([BII[BII)Z
133
* - com/ibm/dataaccess/PackedDecimal.lessThanOrEqualsPackedDecimal_([BII[BII)Z
134
* - com/ibm/dataaccess/PackedDecimal.greaterThanPackedDecimal_([BII[BII)Z
135
* - com/ibm/dataaccess/PackedDecimal.greaterThanOrEqualsPackedDecimal_([BII[BII)Z
136
* - com/ibm/dataaccess/PackedDecimal.equalsPackedDecimal_([BII[BII)Z
137
*
138
*/
139
class TR_DataAccessAccelerator : public TR::Optimization
140
{
141
public:
142
typedef TR::typed_allocator< TR::TreeTop *, TR::Region & > TreeTopContainerAllocator;
143
typedef std::vector< TR::TreeTop*, TreeTopContainerAllocator > TreeTopContainer;
144
typedef TR::typed_allocator< TR::Block *, TR::Region & > BlockContainerAllocator;
145
typedef std::vector< TR::Block*, BlockContainerAllocator > BlockContainer;
146
147
TR_DataAccessAccelerator(TR::OptimizationManager* manager);
148
149
/** \brief
150
* Helper function to create an instance of the StringBuilderTransformer optimization using the
151
* OptimizationManager's default allocator.
152
*
153
* \param manager
154
* The optimization manager.
155
*/
156
static TR::Optimization* create(TR::OptimizationManager* manager)
157
{
158
return new (manager->allocator()) TR_DataAccessAccelerator(manager);
159
}
160
161
/** \brief
162
* Performs the optimization on this compilation unit.
163
*
164
* \return
165
* 1 if any transformation was performed; 0 otherwise.
166
*/
167
virtual int32_t perform();
168
169
/** \brief
170
* Performs the optimization on a specific block within this compilation unit.
171
*
172
* \param block
173
* The block on which to perform this optimization.
174
*
175
* \param variableCallTreeTops
176
* A vector of TR::TreeTop*. Used to build a list of variable precision calls to be used
177
* later
178
*
179
* \return
180
* 1 if any transformation was performed; 0 otherwise.
181
*/
182
virtual int32_t performOnBlock(TR::Block* block, TreeTopContainer* variableCallTreeTops);
183
184
/** \brief
185
* Performs inlining of variable precision API calls after iterating through the entire tree
186
*
187
* \detail
188
* Unlike constant precision DAA call inlining which can be done in-place without introducing extra blocks,
189
* each variable precision call node has to be bloated into multiple
190
* basic blocks to form a precision diamond. This disrupts the CFG and invalidates block iterator and
191
* TreeTop iterator that might be in use. As a result of this, it's difficult to inline variable precision
192
* calls while iterating the entire tree. The solution to this problem is to build a list of variable
193
* precision TreeTops during the tree traversal phase. And after that, go through this list and inline each one of them.
194
*
195
* \param variableCallTreeTops
196
* A vector of TR::TreeTop*. All variable precision calls listed here will be inlined.
197
*
198
* \return
199
* 1 if any transformation was performed; 0 otherwise.
200
*/
201
virtual int32_t processVariableCalls(TreeTopContainer* variableCallTreeTops);
202
203
virtual const char * optDetailString() const throw();
204
205
bool isChildConst (TR::Node* node, int32_t child);
206
207
TR::Node* insertIntegerGetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
208
TR::Node* insertIntegerSetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
209
210
TR::Node* insertDecimalGetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
211
TR::Node* insertDecimalSetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
212
213
bool inlineCheckPackedDecimal(TR::TreeTop* callTreeTop, TR::Node* callNode);
214
215
private:
216
217
TR::Node* constructAddressNode(TR::Node* callNode, TR::Node* arrayNode, TR::Node* offsetNode);
218
219
void createPrecisionDiamond(TR::Compilation* comp,
220
TR::TreeTop* treeTop,
221
TR::TreeTop* fastTree, TR::TreeTop* slowTree,
222
bool isPD2I,
223
uint32_t numPrecisionNodes,
224
...);
225
226
TR::Node* restructureVariablePrecisionCallNode(TR::TreeTop* treeTop, TR::Node* callNode);
227
228
bool generatePD2I(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2i, bool isByteBuffer);
229
bool generatePD2IVariableParameter(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2i, bool isByteBuffer);
230
bool generatePD2IConstantParameter(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2i, bool isByteBuffer);
231
bool generateI2PD(TR::TreeTop* treeTop, TR::Node* callNode, bool isI2PD, bool isByteBuffer);
232
bool genArithmeticIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode, TR::ILOpCodes opCode);
233
bool genComparisionIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode, TR::ILOpCodes opCode);
234
bool genShiftLeftIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode);
235
bool genShiftRightIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode);
236
bool generateUD2PD(TR::TreeTop* treeTop, TR::Node* callNode, bool isUD2PD);
237
bool generatePD2UD(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2UD);
238
239
void insertByteArrayNULLCHK(TR::TreeTop* callTreeTop, TR::Node* callNode, TR::Node* byteArrayNode);
240
void insertByteArrayBNDCHK(TR::TreeTop* callTreeTop, TR::Node* callNode, TR::Node* byteArrayNode, TR::Node* offsetNode, int32_t index);
241
242
TR::Node* createByteArrayElementAddress(TR::TreeTop* callTreeTop, TR::Node* callNode, TR::Node* byteArrayNode, TR::Node* offsetNode);
243
244
bool printInliningStatus(bool status, TR::Node* node, const char* reason = "")
245
{
246
if (trace())
247
{
248
if (status)
249
traceMsg(comp(), "DataAccessAccelerator: Intrinsics on node %p : SUCCESS\n", node);
250
else
251
{
252
traceMsg(comp(), "DataAccessAccelerator: Intrinsics on node %p : FAILED\n", node);
253
traceMsg(comp(), "DataAccessAccelerator: Reason : %s\n", reason);
254
}
255
}
256
return status;
257
}
258
};
259
260
#endif
261
262