CoCalc -- DataAccessAccelerator.hpp

GitHub Repository: PojavLauncherTeam/openj9
Path: blob/master/runtime/compiler/optimizer/DataAccessAccelerator.hpp
⁶⁰⁰⁰ views
1
/*******************************************************************************
2
 * Copyright (c) 2000, 2020 IBM Corp. and others
3
 *
4
 * This program and the accompanying materials are made available under
5
 * the terms of the Eclipse Public License 2.0 which accompanies this
6
 * distribution and is available at https://www.eclipse.org/legal/epl-2.0/
7
 * or the Apache License, Version 2.0 which accompanies this distribution and
8
 * is available at https://www.apache.org/licenses/LICENSE-2.0.
9
 *
10
 * This Source Code may also be made available under the following
11
 * Secondary Licenses when the conditions for such availability set
12
 * forth in the Eclipse Public License, v. 2.0 are satisfied: GNU
13
 * General Public License, version 2 with the GNU Classpath
14
 * Exception [1] and GNU General Public License, version 2 with the
15
 * OpenJDK Assembly Exception [2].
16
 *
17
 * [1] https://www.gnu.org/software/classpath/license.html
18
 * [2] http://openjdk.java.net/legal/assembly-exception.html
19
 *
20
 * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception
21
 *******************************************************************************/
22

23
#ifndef DATAACCESSACCELERATOR_INCL
24
#define DATAACCESSACCELERATOR_INCL
25

26
#include <stddef.h>
27
#include <stdint.h>
28
#include <vector>
29
#include "compile/Compilation.hpp"
30
#include "env/TRMemory.hpp"
31
#include "il/Block.hpp"
32
#include "il/ILOpCodes.hpp"
33
#include "il/Node.hpp"
34
#include "infra/Array.hpp"
35
#include "infra/Assert.hpp"
36
#include "infra/BitVector.hpp"
37
#include "infra/ILWalk.hpp"
38
#include "infra/List.hpp"
39
#include "optimizer/Optimization.hpp"
40
#include "optimizer/OptimizationManager.hpp"
41

42
namespace TR { class TreeTop; }
43

44
/** \brief
45
 *
46
 *  Transforms calls to recognized Data Access Accelerator (DAA) library methods into semantically
47
 *  equivalent hardware intrinsics for the underlying platform.
48
 *
49
 *  \details
50
 *
51
 *  The Data Access Accelerator (DAA) library (com/ibm/dataaccess) found in IBM J9 Virtual Machine is a utility
52
 *  library for performing hardware accelerated operations on Java data types. All library methods have a Java
53
 *  implementation, however if hardware support exists for a particular operation the JIT compiler will attempt
54
 *  to replace calls to such library methods with semantically equivalent hardware intrinsics.
55
 *
56
 *  The DAA library method calls that are actually replaced with hardware intrinsics are not the publicly visible
57
 *  API methods. Instead we replace the private "underscore" counterparts to the public API so as to guard us against
58
 *  possible future modifications of such private methods. For example the
59
 *  com/ibm/dataaccess/ByteArrayMarshaller.writeInt(I[BIZ)V method calls a private
60
 *  com/ibm/dataaccess/ByteArrayMarshaller.writeInt_(I[BIZ)V method which carries out the write operation. The
61
 *  latter method is the so called "underscore" method which we recognize and hardware accelerate.
62

63
 *  Note that if hardware acceleration support is detected we prevent the inlining of such "underscore" methods
64
 *  in the inliner on the assumption that this optimization will reduce such calls to simple semantically
65
 *  equivalent trees which will outperform the inlined call.
66
 *
67
 *  The DAA library is broken up into four main classes and hardware support is defined per method as follows:
68
 *
69
 *  \section com.ibm.dataaccess.ByteArrayMarshaller
70
 *
71
 *  The following methods have hardware acceleration support on x86 (Linux and Windows), PPC (Linux and AIX),
72
 *  and System z (Linux and z/OS):
73
 *
74
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeShort_(S[BIZ)V
75
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeShort_(S[BIZI)V
76
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeInt_(I[BIZ)V
77
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeInt_(I[BIZI)V
78
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeLong_(J[BIZ)V
79
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeLong_(J[BIZI)V
80
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeFloat_(F[BIZ)V
81
 *  - com/ibm/dataaccess/ByteArrayMarshaller.writeDouble_(D[BIZ)V
82
 *
83
 *  \section com.ibm.dataaccess.ByteArrayUnmarshaller
84
 *
85
 *  The following methods have hardware acceleration support on x86 (Linux and Windows), PPC (Linux and AIX),
86
 *  and System z (Linux and z/OS):
87
 *
88
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readShort_([BIZ)S
89
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readShort_([BIZIZ)S
90
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readInt_([BIZ)I
91
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readInt_([BIZIZ)I
92
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readLong_([BIZ)J
93
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readLong_([BIZIZ)J
94
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readFloat_([BIZ)F
95
 *  - com/ibm/dataaccess/ByteArrayUnmarshaller.readDouble_([BIZ)D
96
 *
97
 *  \section com.ibm.dataaccess.DecimalData
98
 *
99
 *  The following methods have hardware acceleration support on System z (Linux and z/OS):
100
 *
101
 *  - com/ibm/dataaccess/DecimalData.convertPackedDecimalToUnicodeDecimal_([BI[CIII)V
102
 *  - com/ibm/dataaccess/DecimalData.convertUnicodeDecimalToPackedDecimal_([CI[BIII)V
103
 *  - com/ibm/dataaccess/DecimalData.convertPackedDecimalToExternalDecimal_([BI[BIII)V
104
 *  - com/ibm/dataaccess/DecimalData.convertExternalDecimalToPackedDecimal_([BI[BIII)V
105
 *
106
 *  The following methods have hardware acceleration support on System z (z/OS):
107
 *
108
 *  - com/ibm/dataaccess/DecimalData.convertPackedDecimalToInteger_([BIIZ)I
109
 *  - com/ibm/dataaccess/DecimalData.convertPackedDecimalToInteger_(Ljava/nio/ByteBuffer;IIZJII)I
110
 *  - com/ibm/dataaccess/DecimalData.convertIntegerToPackedDecimal_(I[BIIZ)V
111
 *  - com/ibm/dataaccess/DecimalData.convertIntegerToPackedDecimal_(ILjava/nio/ByteBuffer;IIZJII)V
112
 *  - com/ibm/dataaccess/DecimalData.convertPackedDecimalToLong_([BIIZ)J
113
 *  - com/ibm/dataaccess/DecimalData.convertPackedDecimalToLong_(Ljava/nio/ByteBuffer;IIZJII)J
114
 *  - com/ibm/dataaccess/DecimalData.convertLongToPackedDecimal_(J[BIIZ)V
115
 *  - com/ibm/dataaccess/DecimalData.convertLongToPackedDecimal_(JLjava/nio/ByteBuffer;IIZJII)V
116
 *
117
 *  \section com.ibm.dataaccess.PackedDecimal
118
 *
119
 *  The following methods have hardware acceleration support on System z (Linux and z/OS):
120
 *
121
 *  - com/ibm/dataaccess/PackedDecimal.checkPackedDecimal_([BIIZZ)I
122
 *
123
 *  The following methods have hardware acceleration support on System z (z/OS):
124
 *
125
 *  - com/ibm/dataaccess/PackedDecimal.addPackedDecimal_([BII[BII[BIIZ)V
126
 *  - com/ibm/dataaccess/PackedDecimal.subtractPackedDecimal_([BII[BII[BIIZ)V
127
 *  - com/ibm/dataaccess/PackedDecimal.multiplyPackedDecimal_([BII[BII[BIIZ)V
128
 *  - com/ibm/dataaccess/PackedDecimal.dividePackedDecimal_([BII[BII[BIIZ)V
129
 *  - com/ibm/dataaccess/PackedDecimal.remainderPackedDecimal_([BII[BII[BIIZ)V
130
 *  - com/ibm/dataaccess/PackedDecimal.shiftLeftPackedDecimal_([BII[BIIIZ)V
131
 *  - com/ibm/dataaccess/PackedDecimal.shiftRightPackedDecimal_([BII[BIIIZ)V
132
 *  - com/ibm/dataaccess/PackedDecimal.lessThanPackedDecimal_([BII[BII)Z
133
 *  - com/ibm/dataaccess/PackedDecimal.lessThanOrEqualsPackedDecimal_([BII[BII)Z
134
 *  - com/ibm/dataaccess/PackedDecimal.greaterThanPackedDecimal_([BII[BII)Z
135
 *  - com/ibm/dataaccess/PackedDecimal.greaterThanOrEqualsPackedDecimal_([BII[BII)Z
136
 *  - com/ibm/dataaccess/PackedDecimal.equalsPackedDecimal_([BII[BII)Z
137
 *
138
 */
139
class TR_DataAccessAccelerator : public TR::Optimization
140
   {
141
   public:
142
   typedef TR::typed_allocator< TR::TreeTop *, TR::Region & > TreeTopContainerAllocator;
143
   typedef std::vector< TR::TreeTop*, TreeTopContainerAllocator > TreeTopContainer;
144
   typedef TR::typed_allocator< TR::Block *, TR::Region & > BlockContainerAllocator;
145
   typedef std::vector< TR::Block*, BlockContainerAllocator > BlockContainer;
146

147
   TR_DataAccessAccelerator(TR::OptimizationManager* manager);
148

149
   /** \brief
150
    *     Helper function to create an instance of the StringBuilderTransformer optimization using the
151
    *     OptimizationManager's default allocator.
152
    *
153
    *  \param manager
154
    *     The optimization manager.
155
    */
156
   static TR::Optimization* create(TR::OptimizationManager* manager)
157
      {
158
      return new (manager->allocator()) TR_DataAccessAccelerator(manager);
159
      }
160

161
   /** \brief
162
    *     Performs the optimization on this compilation unit.
163
    *
164
    *  \return
165
    *     1 if any transformation was performed; 0 otherwise.
166
    */
167
   virtual int32_t perform();
168

169
   /** \brief
170
    *     Performs the optimization on a specific block within this compilation unit.
171
    *
172
    *  \param block
173
    *     The block on which to perform this optimization.
174
    *
175
    *  \param variableCallTreeTops
176
    *     A vector of TR::TreeTop*. Used to build a list of variable precision calls to be used
177
    *     later
178
    *
179
    *  \return
180
    *     1 if any transformation was performed; 0 otherwise.
181
    */
182
   virtual int32_t performOnBlock(TR::Block* block, TreeTopContainer* variableCallTreeTops);
183

184
   /** \brief
185
    *     Performs inlining of variable precision API calls after iterating through the entire tree
186
    *
187
    *  \detail
188
    *     Unlike constant precision DAA call inlining which can be done in-place without introducing extra blocks,
189
    *     each variable precision call node has to be bloated into multiple
190
    *     basic blocks to form a precision diamond. This disrupts the CFG and invalidates block iterator and
191
    *     TreeTop iterator that might be in use. As a result of this, it's difficult to inline variable precision
192
    *     calls while iterating the entire tree. The solution to this problem is to build a list of variable
193
    *     precision TreeTops during the tree traversal phase. And after that, go through this list and inline each one of them.
194
    *
195
    *  \param variableCallTreeTops
196
    *     A vector of TR::TreeTop*. All variable precision calls listed here will be inlined.
197
    *
198
    *  \return
199
    *     1 if any transformation was performed; 0 otherwise.
200
    */
201
   virtual int32_t processVariableCalls(TreeTopContainer* variableCallTreeTops);
202

203
   virtual const char * optDetailString() const throw();
204

205
   bool isChildConst (TR::Node* node, int32_t child);
206

207
   TR::Node* insertIntegerGetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
208
   TR::Node* insertIntegerSetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
209

210
   TR::Node* insertDecimalGetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
211
   TR::Node* insertDecimalSetIntrinsic(TR::TreeTop* callTreeTop, TR::Node* callNode, int32_t sourceNumBytes, int32_t targetNumBytes);
212

213
   bool inlineCheckPackedDecimal(TR::TreeTop* callTreeTop, TR::Node* callNode);
214

215
   private:
216

217
   TR::Node* constructAddressNode(TR::Node* callNode, TR::Node* arrayNode, TR::Node* offsetNode);
218

219
   void createPrecisionDiamond(TR::Compilation* comp,
220
                               TR::TreeTop* treeTop,
221
                               TR::TreeTop* fastTree, TR::TreeTop* slowTree,
222
                               bool isPD2I,
223
                               uint32_t numPrecisionNodes,
224
                               ...);
225

226
   TR::Node* restructureVariablePrecisionCallNode(TR::TreeTop* treeTop, TR::Node* callNode);
227

228
   bool generatePD2I(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2i, bool isByteBuffer);
229
   bool generatePD2IVariableParameter(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2i, bool isByteBuffer);
230
   bool generatePD2IConstantParameter(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2i, bool isByteBuffer);
231
   bool generateI2PD(TR::TreeTop* treeTop, TR::Node* callNode, bool isI2PD, bool isByteBuffer);
232
   bool genArithmeticIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode, TR::ILOpCodes opCode);
233
   bool genComparisionIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode, TR::ILOpCodes opCode);
234
   bool genShiftLeftIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode);
235
   bool genShiftRightIntrinsic(TR::TreeTop* treeTop, TR::Node* callNode);
236
   bool generateUD2PD(TR::TreeTop* treeTop, TR::Node* callNode, bool isUD2PD);
237
   bool generatePD2UD(TR::TreeTop* treeTop, TR::Node* callNode, bool isPD2UD);
238

239
   void insertByteArrayNULLCHK(TR::TreeTop* callTreeTop, TR::Node* callNode, TR::Node* byteArrayNode);
240
   void insertByteArrayBNDCHK(TR::TreeTop* callTreeTop, TR::Node* callNode, TR::Node* byteArrayNode, TR::Node* offsetNode, int32_t index);
241

242
   TR::Node* createByteArrayElementAddress(TR::TreeTop* callTreeTop, TR::Node* callNode, TR::Node* byteArrayNode, TR::Node* offsetNode);
243

244
   bool printInliningStatus(bool status, TR::Node* node, const char* reason = "")
245
      {
246
      if (trace()) 
247
         {
248
            if (status)
249
               traceMsg(comp(), "DataAccessAccelerator: Intrinsics on node %p : SUCCESS\n", node);
250
            else
251
               {
252
               traceMsg(comp(), "DataAccessAccelerator: Intrinsics on node %p : FAILED\n", node);
253
               traceMsg(comp(), "DataAccessAccelerator:     Reason : %s\n", reason);
254
               }
255
         }
256
      return status;
257
      }
258
   };
259

260
#endif
261

262
Product

Resources

Company