Path: blob/master/runtime/compiler/z/codegen/ReduceSynchronizedFieldLoad.cpp
6004 views
/*******************************************************************************1* Copyright (c) 2000, 2021 IBM Corp. and others2*3* This program and the accompanying materials are made available under4* the terms of the Eclipse Public License 2.0 which accompanies this5* distribution and is available at https://www.eclipse.org/legal/epl-2.0/6* or the Apache License, Version 2.0 which accompanies this distribution and7* is available at https://www.apache.org/licenses/LICENSE-2.0.8*9* This Source Code may also be made available under the following10* Secondary Licenses when the conditions for such availability set11* forth in the Eclipse Public License, v. 2.0 are satisfied: GNU12* General Public License, version 2 with the GNU Classpath13* Exception [1] and GNU General Public License, version 2 with the14* OpenJDK Assembly Exception [2].15*16* [1] https://www.gnu.org/software/classpath/license.html17* [2] http://openjdk.java.net/legal/assembly-exception.html18*19* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception20*******************************************************************************/2122#include "codegen/ReduceSynchronizedFieldLoad.hpp"2324#include <stddef.h>25#include <stdint.h>26#include "j9.h"27#include "j9cfg.h"28#include "j9consts.h"29#include "codegen/CodeGenerator.hpp"30#include "codegen/Linkage_inlines.hpp"31#include "codegen/S390CHelperLinkage.hpp"32#include "codegen/TreeEvaluator.hpp"33#include "env/VMJ9.h"34#include "il/ILOps.hpp"35#include "il/ILOpCodes.hpp"36#include "il/Node_inlines.hpp"37#include "il/TreeTop.hpp"38#include "il/TreeTop_inlines.hpp"39#include "infra/Assert.hpp"40#include "z/codegen/S390Evaluator.hpp"41#include "z/codegen/S390GenerateInstructions.hpp"42#include "z/codegen/S390Instruction.hpp"43#include "z/codegen/S390Register.hpp"4445#define OPT_DETAILS "O^O REDUCE SYNCHRONIZED FIELD LOAD: "4647void48ReduceSynchronizedFieldLoad::inlineSynchronizedFieldLoad(TR::Node* node, TR::CodeGenerator* cg)49{50TR::Compilation *comp = cg->comp();51TR::Node* synchronizedObjectNode = node->getChild(0);5253// Materialize the object register first because LPD can only deal with base-displacement type memory references and54// because the object appears directly underneath the indirect load we are guaranteed not to have an index register55TR::Register* objectRegister = cg->evaluate(synchronizedObjectNode);5657TR::Node* loadNode = node->getChild(1);5859TR::Node* monentSymbolReferenceNode = node->getChild(3);60TR::Node* monexitSymbolReferenceNode = node->getChild(4);6162TR::LabelSymbol* mergeLabel = generateLabelSymbol(cg);63TR::LabelSymbol* fastPathLabel = generateLabelSymbol(cg);64TR::LabelSymbol* slowPathLabel = generateLabelSymbol(cg);6566generateS390LabelInstruction(cg, TR::InstOpCode::label, node, fastPathLabel);6768TR_S390OutOfLineCodeSection* outOfLineCodeSection = new (cg->trHeapMemory()) TR_S390OutOfLineCodeSection(slowPathLabel, mergeLabel, cg);69cg->getS390OutOfLineCodeSectionList().push_front(outOfLineCodeSection);70outOfLineCodeSection->swapInstructionListsWithCompilation();7172generateS390LabelInstruction(cg, TR::InstOpCode::label, node, slowPathLabel);7374// Generate a dynamic debug counter for the fallback path whose execution should be extremely rare75cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/success/OOL/%s", comp->signature()));7677J9::Z::CHelperLinkage* helperLink = static_cast<J9::Z::CHelperLinkage*>(cg->getLinkage(TR_CHelper));7879// Calling helper with call node which should NULL80helperLink->buildDirectDispatch(monentSymbolReferenceNode);8182// The logic for evaluating a particular load is non-trivial in both evaluation sequence and setting of the various83// register flags (collected references, etc.). As such we evaluate the load preemptively and extract the84// materialized memory reference directly from the load itself for use in LPD.85TR::Register* loadRegister = cg->evaluate(loadNode);8687// Search for the load memory reference from the previously evaluated load88TR::Instruction* loadInstruction = cg->getAppendInstruction();8990TR_ASSERT_SAFE_FATAL(loadInstruction->isLoad() && (loadInstruction->getKind() == OMR::Instruction::Kind::IsRX || loadInstruction->getKind() == OMR::Instruction::Kind::IsRXY), "Expecting the append instruction to be a load of kind RX or RXY\n");9192TR::MemoryReference* loadMemoryReference = static_cast<TR::S390RXInstruction*>(loadInstruction)->getMemoryReference();9394TR_ASSERT_SAFE_FATAL(loadMemoryReference->getIndexRegister() == NULL, "Load memory reference must not have an index register\n");9596helperLink->buildDirectDispatch(monexitSymbolReferenceNode);9798generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_BRC, node, mergeLabel);99100outOfLineCodeSection->swapInstructionListsWithCompilation();101102TR::Register* lockRegister = cg->allocateRegister();103TR::RegisterPair* registerPair = cg->allocateConsecutiveRegisterPair(lockRegister, loadRegister);104105TR::RegisterDependencyConditions* conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 3, cg);106107conditions->addPostCondition(registerPair, TR::RealRegister::EvenOddPair);108conditions->addPostCondition(loadRegister, TR::RealRegister::LegalEvenOfPair);109conditions->addPostCondition(lockRegister, TR::RealRegister::LegalOddOfPair);110111// Recreate the memory reference since we cannot share the same one for fast and slow paths of the ICF diamond112loadMemoryReference = generateS390MemoryReference(*loadMemoryReference, 0, cg);113114TR::Node* lockWordOffsetNode = node->getChild(2);115116int32_t lockWordOffset = lockWordOffsetNode->getConst<int32_t>();117118TR::MemoryReference* lockMemoryReference = generateS390MemoryReference(objectRegister, lockWordOffset, cg);119120const bool generateCompressedLockWord = static_cast<TR_J9VMBase*>(comp->fe())->generateCompressedLockWord();121122const bool is32BitLock = comp->target().is32Bit() || generateCompressedLockWord;123const bool is32BitLoad = J9::DataType::getSize(loadNode->getDataType()) == 4;124125bool lockRegisterRequiresZeroExt = false;126bool loadRegisterRequiresZeroExt = false;127bool loadRegisterRequiresSignExt = false;128129if (is32BitLock && is32BitLoad)130{131generateSSFInstruction(cg, TR::InstOpCode::LPD, node, registerPair, loadMemoryReference, lockMemoryReference);132133loadRegisterRequiresZeroExt = loadNode->isZeroExtendedTo64BitAtSource();134loadRegisterRequiresSignExt = loadNode->isSignExtendedTo64BitAtSource();135}136else137{138// LPDG requires memory references to be aligned to a double-word boundary139TR::MemoryReference* alignedLockMemoryReference = lockMemoryReference;140TR::MemoryReference* alignedLoadMemoryReference = loadMemoryReference;141142bool lockRegisterRequiresShift = false;143bool loadRegisterRequiresShift = false;144145if (is32BitLock)146{147if ((lockWordOffset & 7) == 0)148{149lockRegisterRequiresShift = true;150}151else152{153// This is because we must use LPDG to load a 32-bit value using displacement -4154TR_ASSERT_SAFE_FATAL((lockWordOffset & 3) == 0, "Lockword must be aligned on a word boundary\n");155156lockRegisterRequiresZeroExt = true;157alignedLockMemoryReference = generateS390MemoryReference(*lockMemoryReference, -4, cg);158}159}160else161{162TR_ASSERT_SAFE_FATAL((lockWordOffset & 7) == 0, "Lockword must be aligned on a double-word boundary\n");163}164165if (is32BitLoad)166{167if ((loadMemoryReference->getOffset() & 7) == 0)168{169loadRegisterRequiresShift = true;170}171else172{173// This is because we must use LPDG to load a 32-bit value using displacement -4174TR_ASSERT_SAFE_FATAL((loadMemoryReference->getOffset() & 3) == 0, "Field must be aligned on a word boundary\n");175176loadRegisterRequiresZeroExt = loadNode->isZeroExtendedTo64BitAtSource();177loadRegisterRequiresSignExt = loadNode->isSignExtendedTo64BitAtSource();178179alignedLoadMemoryReference = generateS390MemoryReference(*loadMemoryReference, -4, cg);180}181}182else183{184TR_ASSERT_SAFE_FATAL((loadMemoryReference->getOffset() & 7) == 0, "Field must be aligned on a double-word boundary\n");185}186187generateSSFInstruction(cg, TR::InstOpCode::LPDG, node, registerPair, alignedLoadMemoryReference, alignedLockMemoryReference);188189if (lockRegisterRequiresShift)190{191generateRSInstruction(cg, TR::InstOpCode::SRLG, node, lockRegister, lockRegister, 32);192}193194if (loadRegisterRequiresShift)195{196generateRSInstruction(cg, TR::InstOpCode::SRLG, node, loadRegister, loadRegister, 32);197}198}199200if (lockRegisterRequiresZeroExt)201{202generateRREInstruction(cg, TR::InstOpCode::LLGFR, node, lockRegister, lockRegister);203}204205if (loadRegisterRequiresZeroExt)206{207generateRREInstruction(cg, TR::InstOpCode::LLGFR, node, loadRegister, loadRegister);208}209210if (loadRegisterRequiresSignExt)211{212generateRRInstruction(cg, TR::InstOpCode::LGFR, node, loadRegister, loadRegister);213}214215// LPD sets condition code 3 if the pair as not loaded by means of interlocked fetch216generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_BO, node, slowPathLabel);217218// Mask out the recursion and lock reservation bits219generateRIInstruction(cg, TR::InstOpCode::NILL, node, lockRegister, ~(OBJECT_HEADER_LOCK_RECURSION_MASK | OBJECT_HEADER_LOCK_RESERVED));220221TR::Register *metaDataReg = cg->getMethodMetaDataRealRegister();222if (is32BitLock && is32BitLoad)223{224// Now check if we loaded the lock of the current thread225generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::CR, node, lockRegister, metaDataReg, TR::InstOpCode::COND_BE, mergeLabel, false);226227// Lock could be free as well228generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::C, node, lockRegister, 0, TR::InstOpCode::COND_BE, mergeLabel, false);229}230else231{232// Now check if we loaded the lock of the current thread233generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::CGR, node, lockRegister, metaDataReg, TR::InstOpCode::COND_BE, mergeLabel, false);234235// Lock could be free as well236generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::CG, node, lockRegister, 0, TR::InstOpCode::COND_BE, mergeLabel, false);237}238239generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_BRC, node, slowPathLabel);240241generateS390LabelInstruction(cg, TR::InstOpCode::label, node, mergeLabel, conditions);242243cg->decReferenceCount(synchronizedObjectNode);244cg->decReferenceCount(loadNode);245cg->decReferenceCount(lockWordOffsetNode);246cg->recursivelyDecReferenceCount(monentSymbolReferenceNode);247cg->recursivelyDecReferenceCount(monexitSymbolReferenceNode);248cg->stopUsingRegister(lockRegister);249cg->stopUsingRegister(registerPair);250}251252bool253ReduceSynchronizedFieldLoad::perform()254{255bool transformed = false;256257if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_S390_Z196))258{259if (!cg->comp()->getOption(TR_DisableSynchronizedFieldLoad) && cg->comp()->getMethodSymbol()->mayContainMonitors())260{261traceMsg(cg->comp(), "Performing ReduceSynchronizedFieldLoad\n");262263for (TR::TreeTopOrderExtendedBlockIterator iter(cg->comp()); iter.getFirst() != NULL; ++iter)264{265transformed = performOnTreeTops(iter.getFirst()->getEntry(), iter.getLast()->getExit());266}267}268}269270return transformed;271}272273bool274ReduceSynchronizedFieldLoad::performOnTreeTops(TR::TreeTop* startTreeTop, TR::TreeTop* endTreeTop)275{276TR::Compilation *comp = cg->comp();277bool transformed = false;278279for (TR::TreeTopIterator iter(startTreeTop, comp); iter != endTreeTop; ++iter)280{281if (iter.currentNode()->getOpCodeValue() == TR::monent ||282iter.currentNode()->getOpCodeValue() == TR::treetop && iter.currentNode()->getFirstChild()->getOpCodeValue() == TR::monent)283{284TR::TreeTop* monentTreeTop = iter.currentTree();285TR::Node* monentNode = iter.currentNode()->getOpCodeValue() == TR::monent ?286iter.currentNode() :287iter.currentNode()->getFirstChild();288// Locking on value types or value based classes is prohibited by the spec.,289// so this optimization can only be performed if we are certain (at compile time)290// the locking object is not a value type or value based291if (cg->isMonitorValueBasedOrValueType(monentNode) != TR_no)292continue;293if (comp->getOption(TR_TraceCG))294{295traceMsg(comp, "Found monent [%p]\n", monentNode);296}297298for (++iter; iter != endTreeTop; ++iter)299{300if (iter.currentNode()->getOpCodeValue() == TR::monexit ||301iter.currentNode()->getOpCodeValue() == TR::treetop && iter.currentNode()->getFirstChild()->getOpCodeValue() == TR::monexit)302{303TR::TreeTop* monexitTreeTop = iter.currentTree();304TR::Node* monexitNode = iter.currentNode()->getOpCodeValue() == TR::monexit ?305iter.currentNode() :306iter.currentNode()->getFirstChild();307if (comp->getOption(TR_TraceCG))308{309traceMsg(comp, "Found monexit [%p]\n", monexitNode);310}311312TR::Node* synchronizedObjectNode = monentNode->getFirstChild();313314if (synchronizedObjectNode == monexitNode->getFirstChild())315{316if (comp->getOption(TR_TraceCG))317{318traceMsg(comp, "Children of monent and monexit are synchronizing on the same object\n", monexitNode);319}320321TR::Node* loadNode = findLoadInSynchornizedRegion(startTreeTop, endTreeTop, monentTreeTop, monexitTreeTop, synchronizedObjectNode);322323if (loadNode != NULL)324{325// Disallow this optimization for 64-bit loads on 31-bit JVM due to register pairs326if (comp->target().is32Bit() && J9::DataType::getSize(loadNode->getDataType()) == 8)327{328TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/failure/31-bit-register-pairs/%s", comp->signature()));329330break;331}332333// When concurrent scavenge is enabled we need to load the object reference using a read barrier however334// there is no guarded load alternative for the LPD instruction. As such this optimization cannot be carried335// out for object reference loads under concurrent scavenge.336if (TR::Compiler->om.readBarrierType() != gc_modron_readbar_none && loadNode->getDataType().isAddress())337{338TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/failure/read-barrier/%s", comp->signature()));339340break;341}342343int32_t lockWordOffset = static_cast<TR_J9VMBase*>(comp->fe())->getByteOffsetToLockword(static_cast<TR_OpaqueClassBlock*>(cg->getMonClass(monentNode)));344345if (comp->getOption(TR_TraceCG))346{347traceMsg(comp, "Lock word offset = %d\n", lockWordOffset);348}349350// LPD(G) is an SSF instruction with a 12-bit displacement351if (lockWordOffset > 0 && lockWordOffset < 4096)352{353if (performTransformation(comp, "%sReplacing monent [%p] - monexit [%p] synchronized region on load [%p] with fabricated call\n", OPT_DETAILS, monentNode, monexitNode, loadNode))354{355transformed = true;356357// Fabricate a special codegen inlined method call symbol reference358TR::SymbolReference* methodSymRef = comp->getSymRefTab()->findOrCreateCodeGenInlinedHelper(TR::SymbolReferenceTable::synchronizedFieldLoadSymbol);359360TR::Node* callNode = TR::Node::createWithSymRef(loadNode, TR::call, 5, methodSymRef);361362callNode->setAndIncChild(0, synchronizedObjectNode);363callNode->setAndIncChild(1, loadNode);364365TR::Node* lockWordOffsetNode = TR::Node::iconst(loadNode, lockWordOffset);366367callNode->setAndIncChild(2, lockWordOffsetNode);368369TR::Node* monentSymbolReferenceNode = TR::Node::createWithSymRef(loadNode, TR::call, 1, synchronizedObjectNode, monentNode->getSymbolReference());370TR::Node* monexitSymbolReferenceNode = TR::Node::createWithSymRef(loadNode, TR::call, 1, synchronizedObjectNode, monexitNode->getSymbolReference());371372callNode->setAndIncChild(3, monentSymbolReferenceNode);373callNode->setAndIncChild(4, monexitSymbolReferenceNode);374375TR::Node* treeTopNode = TR::Node::create(loadNode, TR::treetop, 1, callNode);376377TR::TreeTop* callTreeTop = TR::TreeTop::create(comp, treeTopNode);378379// Insert fabricated call treetop380monentTreeTop->insertBefore(callTreeTop);381382// Remove the monitor region383monentTreeTop->unlink(true);384monexitTreeTop->unlink(true);385386TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/success/%s", comp->signature()));387}388}389else390{391TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/failure/lockword-out-of-bounds/%s", comp->signature()));392}393}394else395{396TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/failure/load-not-found/%s", comp->signature()));397}398}399else400{401TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "codegen/z/ReduceSynchronizedFieldLoad/failure/monexit-synchronized-object-mismatch/%s", comp->signature()));402}403404break;405}406}407408if (iter == endTreeTop)409{410break;411}412}413}414415return transformed;416}417418TR::Node*419ReduceSynchronizedFieldLoad::findLoadInSynchornizedRegion(TR::TreeTop* startTreeTop, TR::TreeTop* endTreeTop, TR::TreeTop* monentTreeTop, TR::TreeTop* monexitTreeTop, TR::Node* synchronizedObjectNode)420{421TR::Compilation *comp = cg->comp();422TR::PreorderNodeIterator iter(startTreeTop, comp);423424// First iterate through all the nodes from the start treetop until we reach the monitor provided so that all nodes425// seen thus far would have already been visited, and hence we will not recurse into them in the subsequent for loop426// since a reference was already seen. This enables us to carry out the reduce synchronized field load optimization427// even if there are side-effect nodes within the monitored region - as long as those side-effect nodes have been428// evaluated outside of the monitored region.429for (; iter != monentTreeTop->getNextTreeTop(); ++iter)430{431TR::Node* currentNode = iter.currentNode();432433if (comp->getOption(TR_TraceCG))434{435traceMsg(comp, "Iterating node [%p] outside the monitored region\n", currentNode);436}437}438439TR::Node* loadNode = NULL;440441for (; iter != monexitTreeTop; ++iter)442{443TR::Node* currentNode = iter.currentNode();444445if (comp->getOption(TR_TraceCG))446{447traceMsg(comp, "Iterating node [%p] inside the monitored region\n", currentNode);448}449450TR::ILOpCode opcode = currentNode->getOpCode();451452if (opcode.hasSymbolReference() || opcode.isBranch())453{454if (loadNode == NULL &&455opcode.isLoadIndirect() && (opcode.isRef() || opcode.isInt() || opcode.isLong()) &&456currentNode->getFirstChild() == synchronizedObjectNode)457{458if (comp->getOption(TR_TraceCG))459{460traceMsg(comp, "Found load node [%p]\n", currentNode);461}462463loadNode = currentNode;464}465else466{467if (comp->getOption(TR_TraceCG))468{469traceMsg(comp, "Found sideeffect node [%p] within the monitored region\n", currentNode);470}471472loadNode = NULL;473474break;475}476}477}478479return loadNode;480}481482483