Path: blob/jdk8u272-b10-aarch32-20201026/hotspot/src/share/vm/opto/gcm.cpp
83404 views
/*1* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation.7*8* This code is distributed in the hope that it will be useful, but WITHOUT9* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or10* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License11* version 2 for more details (a copy is included in the LICENSE file that12* accompanied this code).13*14* You should have received a copy of the GNU General Public License version15* 2 along with this work; if not, write to the Free Software Foundation,16* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.17*18* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA19* or visit www.oracle.com if you need additional information or have any20* questions.21*22*/2324#include "precompiled.hpp"25#include "libadt/vectset.hpp"26#include "memory/allocation.inline.hpp"27#include "opto/block.hpp"28#include "opto/c2compiler.hpp"29#include "opto/callnode.hpp"30#include "opto/cfgnode.hpp"31#include "opto/machnode.hpp"32#include "opto/opcodes.hpp"33#include "opto/phaseX.hpp"34#include "opto/rootnode.hpp"35#include "opto/runtime.hpp"36#include "runtime/deoptimization.hpp"37#if defined AD_MD_HPP38# include AD_MD_HPP39#elif defined TARGET_ARCH_MODEL_x86_3240# include "adfiles/ad_x86_32.hpp"41#elif defined TARGET_ARCH_MODEL_x86_6442# include "adfiles/ad_x86_64.hpp"43#elif defined TARGET_ARCH_MODEL_aarch6444# include "adfiles/ad_aarch64.hpp"45#elif defined TARGET_ARCH_MODEL_sparc46# include "adfiles/ad_sparc.hpp"47#elif defined TARGET_ARCH_MODEL_zero48# include "adfiles/ad_zero.hpp"49#elif defined TARGET_ARCH_MODEL_ppc_6450# include "adfiles/ad_ppc_64.hpp"51#elif defined TARGET_ARCH_MODEL_aarch3252# include "adfiles/ad_aarch32.hpp"53#endif545556// Portions of code courtesy of Clifford Click5758// Optimization - Graph Style5960// To avoid float value underflow61#define MIN_BLOCK_FREQUENCY 1.e-35f6263//----------------------------schedule_node_into_block-------------------------64// Insert node n into block b. Look for projections of n and make sure they65// are in b also.66void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {67// Set basic block of n, Add n to b,68map_node_to_block(n, b);69b->add_inst(n);7071// After Matching, nearly any old Node may have projections trailing it.72// These are usually machine-dependent flags. In any case, they might73// float to another block below this one. Move them up.74for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {75Node* use = n->fast_out(i);76if (use->is_Proj()) {77Block* buse = get_block_for_node(use);78if (buse != b) { // In wrong block?79if (buse != NULL) {80buse->find_remove(use); // Remove from wrong block81}82map_node_to_block(use, b);83b->add_inst(use);84}85}86}87}8889//----------------------------replace_block_proj_ctrl-------------------------90// Nodes that have is_block_proj() nodes as their control need to use91// the appropriate Region for their actual block as their control since92// the projection will be in a predecessor block.93void PhaseCFG::replace_block_proj_ctrl( Node *n ) {94const Node *in0 = n->in(0);95assert(in0 != NULL, "Only control-dependent");96const Node *p = in0->is_block_proj();97if (p != NULL && p != n) { // Control from a block projection?98assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");99// Find trailing Region100Block *pb = get_block_for_node(in0); // Block-projection already has basic block101uint j = 0;102if (pb->_num_succs != 1) { // More then 1 successor?103// Search for successor104uint max = pb->number_of_nodes();105assert( max > 1, "" );106uint start = max - pb->_num_succs;107// Find which output path belongs to projection108for (j = start; j < max; j++) {109if( pb->get_node(j) == in0 )110break;111}112assert( j < max, "must find" );113// Change control to match head of successor basic block114j -= start;115}116n->set_req(0, pb->_succs[j]->head());117}118}119120121//------------------------------schedule_pinned_nodes--------------------------122// Set the basic block for Nodes pinned into blocks123void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) {124// Allocate node stack of size C->live_nodes()+8 to avoid frequent realloc125GrowableArray <Node *> spstack(C->live_nodes() + 8);126spstack.push(_root);127while (spstack.is_nonempty()) {128Node* node = spstack.pop();129if (!visited.test_set(node->_idx)) { // Test node and flag it as visited130if (node->pinned() && !has_block(node)) { // Pinned? Nail it down!131assert(node->in(0), "pinned Node must have Control");132// Before setting block replace block_proj control edge133replace_block_proj_ctrl(node);134Node* input = node->in(0);135while (!input->is_block_start()) {136input = input->in(0);137}138Block* block = get_block_for_node(input); // Basic block of controlling input139schedule_node_into_block(node, block);140}141142// process all inputs that are non NULL143for (int i = node->req() - 1; i >= 0; --i) {144if (node->in(i) != NULL) {145spstack.push(node->in(i));146}147}148}149}150}151152#ifdef ASSERT153// Assert that new input b2 is dominated by all previous inputs.154// Check this by by seeing that it is dominated by b1, the deepest155// input observed until b2.156static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {157if (b1 == NULL) return;158assert(b1->_dom_depth < b2->_dom_depth, "sanity");159Block* tmp = b2;160while (tmp != b1 && tmp != NULL) {161tmp = tmp->_idom;162}163if (tmp != b1) {164// Detected an unschedulable graph. Print some nice stuff and die.165tty->print_cr("!!! Unschedulable graph !!!");166for (uint j=0; j<n->len(); j++) { // For all inputs167Node* inn = n->in(j); // Get input168if (inn == NULL) continue; // Ignore NULL, missing inputs169Block* inb = cfg->get_block_for_node(inn);170tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,171inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);172inn->dump();173}174tty->print("Failing node: ");175n->dump();176assert(false, "unscheduable graph");177}178}179#endif180181static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {182// Find the last input dominated by all other inputs.183Block* deepb = NULL; // Deepest block so far184int deepb_dom_depth = 0;185for (uint k = 0; k < n->len(); k++) { // For all inputs186Node* inn = n->in(k); // Get input187if (inn == NULL) continue; // Ignore NULL, missing inputs188Block* inb = cfg->get_block_for_node(inn);189assert(inb != NULL, "must already have scheduled this input");190if (deepb_dom_depth < (int) inb->_dom_depth) {191// The new inb must be dominated by the previous deepb.192// The various inputs must be linearly ordered in the dom193// tree, or else there will not be a unique deepest block.194DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));195deepb = inb; // Save deepest block196deepb_dom_depth = deepb->_dom_depth;197}198}199assert(deepb != NULL, "must be at least one input to n");200return deepb;201}202203204//------------------------------schedule_early---------------------------------205// Find the earliest Block any instruction can be placed in. Some instructions206// are pinned into Blocks. Unpinned instructions can appear in last block in207// which all their inputs occur.208bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {209// Allocate stack with enough space to avoid frequent realloc210Node_Stack nstack(roots.Size() + 8);211// _root will be processed among C->top() inputs212roots.push(C->top());213visited.set(C->top()->_idx);214215while (roots.size() != 0) {216// Use local variables nstack_top_n & nstack_top_i to cache values217// on stack's top.218Node* parent_node = roots.pop();219uint input_index = 0;220221while (true) {222if (input_index == 0) {223// Fixup some control. Constants without control get attached224// to root and nodes that use is_block_proj() nodes should be attached225// to the region that starts their block.226const Node* control_input = parent_node->in(0);227if (control_input != NULL) {228replace_block_proj_ctrl(parent_node);229} else {230// Is a constant with NO inputs?231if (parent_node->req() == 1) {232parent_node->set_req(0, _root);233}234}235}236237// First, visit all inputs and force them to get a block. If an238// input is already in a block we quit following inputs (to avoid239// cycles). Instead we put that Node on a worklist to be handled240// later (since IT'S inputs may not have a block yet).241242// Assume all n's inputs will be processed243bool done = true;244245while (input_index < parent_node->len()) {246Node* in = parent_node->in(input_index++);247if (in == NULL) {248continue;249}250251int is_visited = visited.test_set(in->_idx);252if (!has_block(in)) {253if (is_visited) {254assert(false, "graph should be schedulable");255return false;256}257// Save parent node and next input's index.258nstack.push(parent_node, input_index);259// Process current input now.260parent_node = in;261input_index = 0;262// Not all n's inputs processed.263done = false;264break;265} else if (!is_visited) {266// Visit this guy later, using worklist267roots.push(in);268}269}270271if (done) {272// All of n's inputs have been processed, complete post-processing.273274// Some instructions are pinned into a block. These include Region,275// Phi, Start, Return, and other control-dependent instructions and276// any projections which depend on them.277if (!parent_node->pinned()) {278// Set earliest legal block.279Block* earliest_block = find_deepest_input(parent_node, this);280map_node_to_block(parent_node, earliest_block);281} else {282assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge");283}284285if (nstack.is_empty()) {286// Finished all nodes on stack.287// Process next node on the worklist 'roots'.288break;289}290// Get saved parent node and next input's index.291parent_node = nstack.node();292input_index = nstack.index();293nstack.pop();294}295}296}297return true;298}299300//------------------------------dom_lca----------------------------------------301// Find least common ancestor in dominator tree302// LCA is a current notion of LCA, to be raised above 'this'.303// As a convenient boundary condition, return 'this' if LCA is NULL.304// Find the LCA of those two nodes.305Block* Block::dom_lca(Block* LCA) {306if (LCA == NULL || LCA == this) return this;307308Block* anc = this;309while (anc->_dom_depth > LCA->_dom_depth)310anc = anc->_idom; // Walk up till anc is as high as LCA311312while (LCA->_dom_depth > anc->_dom_depth)313LCA = LCA->_idom; // Walk up till LCA is as high as anc314315while (LCA != anc) { // Walk both up till they are the same316LCA = LCA->_idom;317anc = anc->_idom;318}319320return LCA;321}322323//--------------------------raise_LCA_above_use--------------------------------324// We are placing a definition, and have been given a def->use edge.325// The definition must dominate the use, so move the LCA upward in the326// dominator tree to dominate the use. If the use is a phi, adjust327// the LCA only with the phi input paths which actually use this def.328static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {329Block* buse = cfg->get_block_for_node(use);330if (buse == NULL) return LCA; // Unused killing Projs have no use block331if (!use->is_Phi()) return buse->dom_lca(LCA);332uint pmax = use->req(); // Number of Phi inputs333// Why does not this loop just break after finding the matching input to334// the Phi? Well...it's like this. I do not have true def-use/use-def335// chains. Means I cannot distinguish, from the def-use direction, which336// of many use-defs lead from the same use to the same def. That is, this337// Phi might have several uses of the same def. Each use appears in a338// different predecessor block. But when I enter here, I cannot distinguish339// which use-def edge I should find the predecessor block for. So I find340// them all. Means I do a little extra work if a Phi uses the same value341// more than once.342for (uint j=1; j<pmax; j++) { // For all inputs343if (use->in(j) == def) { // Found matching input?344Block* pred = cfg->get_block_for_node(buse->pred(j));345LCA = pred->dom_lca(LCA);346}347}348return LCA;349}350351//----------------------------raise_LCA_above_marks----------------------------352// Return a new LCA that dominates LCA and any of its marked predecessors.353// Search all my parents up to 'early' (exclusive), looking for predecessors354// which are marked with the given index. Return the LCA (in the dom tree)355// of all marked blocks. If there are none marked, return the original356// LCA.357static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {358Block_List worklist;359worklist.push(LCA);360while (worklist.size() > 0) {361Block* mid = worklist.pop();362if (mid == early) continue; // stop searching here363364// Test and set the visited bit.365if (mid->raise_LCA_visited() == mark) continue; // already visited366367// Don't process the current LCA, otherwise the search may terminate early368if (mid != LCA && mid->raise_LCA_mark() == mark) {369// Raise the LCA.370LCA = mid->dom_lca(LCA);371if (LCA == early) break; // stop searching everywhere372assert(early->dominates(LCA), "early is high enough");373// Resume searching at that point, skipping intermediate levels.374worklist.push(LCA);375if (LCA == mid)376continue; // Don't mark as visited to avoid early termination.377} else {378// Keep searching through this block's predecessors.379for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {380Block* mid_parent = cfg->get_block_for_node(mid->pred(j));381worklist.push(mid_parent);382}383}384mid->set_raise_LCA_visited(mark);385}386return LCA;387}388389//--------------------------memory_early_block--------------------------------390// This is a variation of find_deepest_input, the heart of schedule_early.391// Find the "early" block for a load, if we considered only memory and392// address inputs, that is, if other data inputs were ignored.393//394// Because a subset of edges are considered, the resulting block will395// be earlier (at a shallower dom_depth) than the true schedule_early396// point of the node. We compute this earlier block as a more permissive397// site for anti-dependency insertion, but only if subsume_loads is enabled.398static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {399Node* base;400Node* index;401Node* store = load->in(MemNode::Memory);402load->as_Mach()->memory_inputs(base, index);403404assert(base != NodeSentinel && index != NodeSentinel,405"unexpected base/index inputs");406407Node* mem_inputs[4];408int mem_inputs_length = 0;409if (base != NULL) mem_inputs[mem_inputs_length++] = base;410if (index != NULL) mem_inputs[mem_inputs_length++] = index;411if (store != NULL) mem_inputs[mem_inputs_length++] = store;412413// In the comparision below, add one to account for the control input,414// which may be null, but always takes up a spot in the in array.415if (mem_inputs_length + 1 < (int) load->req()) {416// This "load" has more inputs than just the memory, base and index inputs.417// For purposes of checking anti-dependences, we need to start418// from the early block of only the address portion of the instruction,419// and ignore other blocks that may have factored into the wider420// schedule_early calculation.421if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);422423Block* deepb = NULL; // Deepest block so far424int deepb_dom_depth = 0;425for (int i = 0; i < mem_inputs_length; i++) {426Block* inb = cfg->get_block_for_node(mem_inputs[i]);427if (deepb_dom_depth < (int) inb->_dom_depth) {428// The new inb must be dominated by the previous deepb.429// The various inputs must be linearly ordered in the dom430// tree, or else there will not be a unique deepest block.431DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));432deepb = inb; // Save deepest block433deepb_dom_depth = deepb->_dom_depth;434}435}436early = deepb;437}438439return early;440}441442//--------------------------insert_anti_dependences---------------------------443// A load may need to witness memory that nearby stores can overwrite.444// For each nearby store, either insert an "anti-dependence" edge445// from the load to the store, or else move LCA upward to force the446// load to (eventually) be scheduled in a block above the store.447//448// Do not add edges to stores on distinct control-flow paths;449// only add edges to stores which might interfere.450//451// Return the (updated) LCA. There will not be any possibly interfering452// store between the load's "early block" and the updated LCA.453// Any stores in the updated LCA will have new precedence edges454// back to the load. The caller is expected to schedule the load455// in the LCA, in which case the precedence edges will make LCM456// preserve anti-dependences. The caller may also hoist the load457// above the LCA, if it is not the early block.458Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {459assert(load->needs_anti_dependence_check(), "must be a load of some sort");460assert(LCA != NULL, "");461DEBUG_ONLY(Block* LCA_orig = LCA);462463// Compute the alias index. Loads and stores with different alias indices464// do not need anti-dependence edges.465uint load_alias_idx = C->get_alias_index(load->adr_type());466#ifdef ASSERT467if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&468(PrintOpto || VerifyAliases ||469PrintMiscellaneous && (WizardMode || Verbose))) {470// Load nodes should not consume all of memory.471// Reporting a bottom type indicates a bug in adlc.472// If some particular type of node validly consumes all of memory,473// sharpen the preceding "if" to exclude it, so we can catch bugs here.474tty->print_cr("*** Possible Anti-Dependence Bug: Load consumes all of memory.");475load->dump(2);476if (VerifyAliases) assert(load_alias_idx != Compile::AliasIdxBot, "");477}478#endif479assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),480"String compare is only known 'load' that does not conflict with any stores");481assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),482"String equals is a 'load' that does not conflict with any stores");483assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),484"String indexOf is a 'load' that does not conflict with any stores");485assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),486"Arrays equals is a 'load' that do not conflict with any stores");487488if (!C->alias_type(load_alias_idx)->is_rewritable()) {489// It is impossible to spoil this load by putting stores before it,490// because we know that the stores will never update the value491// which 'load' must witness.492return LCA;493}494495node_idx_t load_index = load->_idx;496497// Note the earliest legal placement of 'load', as determined by498// by the unique point in the dom tree where all memory effects499// and other inputs are first available. (Computed by schedule_early.)500// For normal loads, 'early' is the shallowest place (dom graph wise)501// to look for anti-deps between this load and any store.502Block* early = get_block_for_node(load);503504// If we are subsuming loads, compute an "early" block that only considers505// memory or address inputs. This block may be different than the506// schedule_early block in that it could be at an even shallower depth in the507// dominator tree, and allow for a broader discovery of anti-dependences.508if (C->subsume_loads()) {509early = memory_early_block(load, early, this);510}511512ResourceArea *area = Thread::current()->resource_area();513Node_List worklist_mem(area); // prior memory state to store514Node_List worklist_store(area); // possible-def to explore515Node_List worklist_visited(area); // visited mergemem nodes516Node_List non_early_stores(area); // all relevant stores outside of early517bool must_raise_LCA = false;518519#ifdef TRACK_PHI_INPUTS520// %%% This extra checking fails because MergeMem nodes are not GVNed.521// Provide "phi_inputs" to check if every input to a PhiNode is from the522// original memory state. This indicates a PhiNode for which should not523// prevent the load from sinking. For such a block, set_raise_LCA_mark524// may be overly conservative.525// Mechanism: count inputs seen for each Phi encountered in worklist_store.526DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));527#endif528529// 'load' uses some memory state; look for users of the same state.530// Recurse through MergeMem nodes to the stores that use them.531532// Each of these stores is a possible definition of memory533// that 'load' needs to use. We need to force 'load'534// to occur before each such store. When the store is in535// the same block as 'load', we insert an anti-dependence536// edge load->store.537538// The relevant stores "nearby" the load consist of a tree rooted539// at initial_mem, with internal nodes of type MergeMem.540// Therefore, the branches visited by the worklist are of this form:541// initial_mem -> (MergeMem ->)* store542// The anti-dependence constraints apply only to the fringe of this tree.543544Node* initial_mem = load->in(MemNode::Memory);545worklist_store.push(initial_mem);546worklist_visited.push(initial_mem);547worklist_mem.push(NULL);548while (worklist_store.size() > 0) {549// Examine a nearby store to see if it might interfere with our load.550Node* mem = worklist_mem.pop();551Node* store = worklist_store.pop();552uint op = store->Opcode();553554// MergeMems do not directly have anti-deps.555// Treat them as internal nodes in a forward tree of memory states,556// the leaves of which are each a 'possible-def'.557if (store == initial_mem // root (exclusive) of tree we are searching558|| op == Op_MergeMem // internal node of tree we are searching559) {560mem = store; // It's not a possibly interfering store.561if (store == initial_mem)562initial_mem = NULL; // only process initial memory once563564for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {565store = mem->fast_out(i);566if (store->is_MergeMem()) {567// Be sure we don't get into combinatorial problems.568// (Allow phis to be repeated; they can merge two relevant states.)569uint j = worklist_visited.size();570for (; j > 0; j--) {571if (worklist_visited.at(j-1) == store) break;572}573if (j > 0) continue; // already on work list; do not repeat574worklist_visited.push(store);575}576worklist_mem.push(mem);577worklist_store.push(store);578}579continue;580}581582if (op == Op_MachProj || op == Op_Catch) continue;583if (store->needs_anti_dependence_check()) continue; // not really a store584585// Compute the alias index. Loads and stores with different alias586// indices do not need anti-dependence edges. Wide MemBar's are587// anti-dependent on everything (except immutable memories).588const TypePtr* adr_type = store->adr_type();589if (!C->can_alias(adr_type, load_alias_idx)) continue;590591// Most slow-path runtime calls do NOT modify Java memory, but592// they can block and so write Raw memory.593if (store->is_Mach()) {594MachNode* mstore = store->as_Mach();595if (load_alias_idx != Compile::AliasIdxRaw) {596// Check for call into the runtime using the Java calling597// convention (and from there into a wrapper); it has no598// _method. Can't do this optimization for Native calls because599// they CAN write to Java memory.600if (mstore->ideal_Opcode() == Op_CallStaticJava) {601assert(mstore->is_MachSafePoint(), "");602MachSafePointNode* ms = (MachSafePointNode*) mstore;603assert(ms->is_MachCallJava(), "");604MachCallJavaNode* mcj = (MachCallJavaNode*) ms;605if (mcj->_method == NULL) {606// These runtime calls do not write to Java visible memory607// (other than Raw) and so do not require anti-dependence edges.608continue;609}610}611// Same for SafePoints: they read/write Raw but only read otherwise.612// This is basically a workaround for SafePoints only defining control613// instead of control + memory.614if (mstore->ideal_Opcode() == Op_SafePoint)615continue;616} else {617// Some raw memory, such as the load of "top" at an allocation,618// can be control dependent on the previous safepoint. See619// comments in GraphKit::allocate_heap() about control input.620// Inserting an anti-dep between such a safepoint and a use621// creates a cycle, and will cause a subsequent failure in622// local scheduling. (BugId 4919904)623// (%%% How can a control input be a safepoint and not a projection??)624if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)625continue;626}627}628629// Identify a block that the current load must be above,630// or else observe that 'store' is all the way up in the631// earliest legal block for 'load'. In the latter case,632// immediately insert an anti-dependence edge.633Block* store_block = get_block_for_node(store);634assert(store_block != NULL, "unused killing projections skipped above");635636if (store->is_Phi()) {637// 'load' uses memory which is one (or more) of the Phi's inputs.638// It must be scheduled not before the Phi, but rather before639// each of the relevant Phi inputs.640//641// Instead of finding the LCA of all inputs to a Phi that match 'mem',642// we mark each corresponding predecessor block and do a combined643// hoisting operation later (raise_LCA_above_marks).644//645// Do not assert(store_block != early, "Phi merging memory after access")646// PhiNode may be at start of block 'early' with backedge to 'early'647DEBUG_ONLY(bool found_match = false);648for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {649if (store->in(j) == mem) { // Found matching input?650DEBUG_ONLY(found_match = true);651Block* pred_block = get_block_for_node(store_block->pred(j));652if (pred_block != early) {653// If any predecessor of the Phi matches the load's "early block",654// we do not need a precedence edge between the Phi and 'load'655// since the load will be forced into a block preceding the Phi.656pred_block->set_raise_LCA_mark(load_index);657assert(!LCA_orig->dominates(pred_block) ||658early->dominates(pred_block), "early is high enough");659must_raise_LCA = true;660} else {661// anti-dependent upon PHI pinned below 'early', no edge needed662LCA = early; // but can not schedule below 'early'663}664}665}666assert(found_match, "no worklist bug");667#ifdef TRACK_PHI_INPUTS668#ifdef ASSERT669// This assert asks about correct handling of PhiNodes, which may not670// have all input edges directly from 'mem'. See BugId 4621264671int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;672// Increment by exactly one even if there are multiple copies of 'mem'673// coming into the phi, because we will run this block several times674// if there are several copies of 'mem'. (That's how DU iterators work.)675phi_inputs.at_put(store->_idx, num_mem_inputs);676assert(PhiNode::Input + num_mem_inputs < store->req(),677"Expect at least one phi input will not be from original memory state");678#endif //ASSERT679#endif //TRACK_PHI_INPUTS680} else if (store_block != early) {681// 'store' is between the current LCA and earliest possible block.682// Label its block, and decide later on how to raise the LCA683// to include the effect on LCA of this store.684// If this store's block gets chosen as the raised LCA, we685// will find him on the non_early_stores list and stick him686// with a precedence edge.687// (But, don't bother if LCA is already raised all the way.)688if (LCA != early) {689store_block->set_raise_LCA_mark(load_index);690must_raise_LCA = true;691non_early_stores.push(store);692}693} else {694// Found a possibly-interfering store in the load's 'early' block.695// This means 'load' cannot sink at all in the dominator tree.696// Add an anti-dep edge, and squeeze 'load' into the highest block.697assert(store != load->in(0), "dependence cycle found");698if (verify) {699assert(store->find_edge(load) != -1, "missing precedence edge");700} else {701store->add_prec(load);702}703LCA = early;704// This turns off the process of gathering non_early_stores.705}706}707// (Worklist is now empty; all nearby stores have been visited.)708709// Finished if 'load' must be scheduled in its 'early' block.710// If we found any stores there, they have already been given711// precedence edges.712if (LCA == early) return LCA;713714// We get here only if there are no possibly-interfering stores715// in the load's 'early' block. Move LCA up above all predecessors716// which contain stores we have noted.717//718// The raised LCA block can be a home to such interfering stores,719// but its predecessors must not contain any such stores.720//721// The raised LCA will be a lower bound for placing the load,722// preventing the load from sinking past any block containing723// a store that may invalidate the memory state required by 'load'.724if (must_raise_LCA)725LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);726if (LCA == early) return LCA;727728// Insert anti-dependence edges from 'load' to each store729// in the non-early LCA block.730// Mine the non_early_stores list for such stores.731if (LCA->raise_LCA_mark() == load_index) {732while (non_early_stores.size() > 0) {733Node* store = non_early_stores.pop();734Block* store_block = get_block_for_node(store);735if (store_block == LCA) {736// add anti_dependence from store to load in its own block737assert(store != load->in(0), "dependence cycle found");738if (verify) {739assert(store->find_edge(load) != -1, "missing precedence edge");740} else {741store->add_prec(load);742}743} else {744assert(store_block->raise_LCA_mark() == load_index, "block was marked");745// Any other stores we found must be either inside the new LCA746// or else outside the original LCA. In the latter case, they747// did not interfere with any use of 'load'.748assert(LCA->dominates(store_block)749|| !LCA_orig->dominates(store_block), "no stray stores");750}751}752}753754// Return the highest block containing stores; any stores755// within that block have been given anti-dependence edges.756return LCA;757}758759// This class is used to iterate backwards over the nodes in the graph.760761class Node_Backward_Iterator {762763private:764Node_Backward_Iterator();765766public:767// Constructor for the iterator768Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);769770// Postincrement operator to iterate over the nodes771Node *next();772773private:774VectorSet &_visited;775Node_List &_stack;776PhaseCFG &_cfg;777};778779// Constructor for the Node_Backward_Iterator780Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)781: _visited(visited), _stack(stack), _cfg(cfg) {782// The stack should contain exactly the root783stack.clear();784stack.push(root);785786// Clear the visited bits787visited.Clear();788}789790// Iterator for the Node_Backward_Iterator791Node *Node_Backward_Iterator::next() {792793// If the _stack is empty, then just return NULL: finished.794if ( !_stack.size() )795return NULL;796797// '_stack' is emulating a real _stack. The 'visit-all-users' loop has been798// made stateless, so I do not need to record the index 'i' on my _stack.799// Instead I visit all users each time, scanning for unvisited users.800// I visit unvisited not-anti-dependence users first, then anti-dependent801// children next.802Node *self = _stack.pop();803804// I cycle here when I am entering a deeper level of recursion.805// The key variable 'self' was set prior to jumping here.806while( 1 ) {807808_visited.set(self->_idx);809810// Now schedule all uses as late as possible.811const Node* src = self->is_Proj() ? self->in(0) : self;812uint src_rpo = _cfg.get_block_for_node(src)->_rpo;813814// Schedule all nodes in a post-order visit815Node *unvisited = NULL; // Unvisited anti-dependent Node, if any816817// Scan for unvisited nodes818for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {819// For all uses, schedule late820Node* n = self->fast_out(i); // Use821822// Skip already visited children823if ( _visited.test(n->_idx) )824continue;825826// do not traverse backward control edges827Node *use = n->is_Proj() ? n->in(0) : n;828uint use_rpo = _cfg.get_block_for_node(use)->_rpo;829830if ( use_rpo < src_rpo )831continue;832833// Phi nodes always precede uses in a basic block834if ( use_rpo == src_rpo && use->is_Phi() )835continue;836837unvisited = n; // Found unvisited838839// Check for possible-anti-dependent840if( !n->needs_anti_dependence_check() )841break; // Not visited, not anti-dep; schedule it NOW842}843844// Did I find an unvisited not-anti-dependent Node?845if ( !unvisited )846break; // All done with children; post-visit 'self'847848// Visit the unvisited Node. Contains the obvious push to849// indicate I'm entering a deeper level of recursion. I push the850// old state onto the _stack and set a new state and loop (recurse).851_stack.push(self);852self = unvisited;853} // End recursion loop854855return self;856}857858//------------------------------ComputeLatenciesBackwards----------------------859// Compute the latency of all the instructions.860void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_List &stack) {861#ifndef PRODUCT862if (trace_opto_pipelining())863tty->print("\n#---- ComputeLatenciesBackwards ----\n");864#endif865866Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);867Node *n;868869// Walk over all the nodes from last to first870while (n = iter.next()) {871// Set the latency for the definitions of this instruction872partial_latency_of_defs(n);873}874} // end ComputeLatenciesBackwards875876//------------------------------partial_latency_of_defs------------------------877// Compute the latency impact of this node on all defs. This computes878// a number that increases as we approach the beginning of the routine.879void PhaseCFG::partial_latency_of_defs(Node *n) {880// Set the latency for this instruction881#ifndef PRODUCT882if (trace_opto_pipelining()) {883tty->print("# latency_to_inputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));884dump();885}886#endif887888if (n->is_Proj()) {889n = n->in(0);890}891892if (n->is_Root()) {893return;894}895896uint nlen = n->len();897uint use_latency = get_latency_for_node(n);898uint use_pre_order = get_block_for_node(n)->_pre_order;899900for (uint j = 0; j < nlen; j++) {901Node *def = n->in(j);902903if (!def || def == n) {904continue;905}906907// Walk backwards thru projections908if (def->is_Proj()) {909def = def->in(0);910}911912#ifndef PRODUCT913if (trace_opto_pipelining()) {914tty->print("# in(%2d): ", j);915def->dump();916}917#endif918919// If the defining block is not known, assume it is ok920Block *def_block = get_block_for_node(def);921uint def_pre_order = def_block ? def_block->_pre_order : 0;922923if ((use_pre_order < def_pre_order) || (use_pre_order == def_pre_order && n->is_Phi())) {924continue;925}926927uint delta_latency = n->latency(j);928uint current_latency = delta_latency + use_latency;929930if (get_latency_for_node(def) < current_latency) {931set_latency_for_node(def, current_latency);932}933934#ifndef PRODUCT935if (trace_opto_pipelining()) {936tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d", use_latency, j, delta_latency, current_latency, def->_idx, get_latency_for_node(def));937}938#endif939}940}941942//------------------------------latency_from_use-------------------------------943// Compute the latency of a specific use944int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {945// If self-reference, return no latency946if (use == n || use->is_Root()) {947return 0;948}949950uint def_pre_order = get_block_for_node(def)->_pre_order;951uint latency = 0;952953// If the use is not a projection, then it is simple...954if (!use->is_Proj()) {955#ifndef PRODUCT956if (trace_opto_pipelining()) {957tty->print("# out(): ");958use->dump();959}960#endif961962uint use_pre_order = get_block_for_node(use)->_pre_order;963964if (use_pre_order < def_pre_order)965return 0;966967if (use_pre_order == def_pre_order && use->is_Phi())968return 0;969970uint nlen = use->len();971uint nl = get_latency_for_node(use);972973for ( uint j=0; j<nlen; j++ ) {974if (use->in(j) == n) {975// Change this if we want local latencies976uint ul = use->latency(j);977uint l = ul + nl;978if (latency < l) latency = l;979#ifndef PRODUCT980if (trace_opto_pipelining()) {981tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",982nl, j, ul, l, latency);983}984#endif985}986}987} else {988// This is a projection, just grab the latency of the use(s)989for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {990uint l = latency_from_use(use, def, use->fast_out(j));991if (latency < l) latency = l;992}993}994995return latency;996}997998//------------------------------latency_from_uses------------------------------999// Compute the latency of this instruction relative to all of it's uses.1000// This computes a number that increases as we approach the beginning of the1001// routine.1002void PhaseCFG::latency_from_uses(Node *n) {1003// Set the latency for this instruction1004#ifndef PRODUCT1005if (trace_opto_pipelining()) {1006tty->print("# latency_from_outputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));1007dump();1008}1009#endif1010uint latency=0;1011const Node *def = n->is_Proj() ? n->in(0): n;10121013for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {1014uint l = latency_from_use(n, def, n->fast_out(i));10151016if (latency < l) latency = l;1017}10181019set_latency_for_node(n, latency);1020}10211022//------------------------------hoist_to_cheaper_block-------------------------1023// Pick a block for node self, between early and LCA, that is a cheaper1024// alternative to LCA.1025Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {1026const double delta = 1+PROB_UNLIKELY_MAG(4);1027Block* least = LCA;1028double least_freq = least->_freq;1029uint target = get_latency_for_node(self);1030uint start_latency = get_latency_for_node(LCA->head());1031uint end_latency = get_latency_for_node(LCA->get_node(LCA->end_idx()));1032bool in_latency = (target <= start_latency);1033const Block* root_block = get_block_for_node(_root);10341035// Turn off latency scheduling if scheduling is just plain off1036if (!C->do_scheduling())1037in_latency = true;10381039// Do not hoist (to cover latency) instructions which target a1040// single register. Hoisting stretches the live range of the1041// single register and may force spilling.1042MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;1043if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())1044in_latency = true;10451046#ifndef PRODUCT1047if (trace_opto_pipelining()) {1048tty->print("# Find cheaper block for latency %d: ", get_latency_for_node(self));1049self->dump();1050tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",1051LCA->_pre_order,1052LCA->head()->_idx,1053start_latency,1054LCA->get_node(LCA->end_idx())->_idx,1055end_latency,1056least_freq);1057}1058#endif10591060int cand_cnt = 0; // number of candidates tried10611062// Walk up the dominator tree from LCA (Lowest common ancestor) to1063// the earliest legal location. Capture the least execution frequency.1064while (LCA != early) {1065LCA = LCA->_idom; // Follow up the dominator tree10661067if (LCA == NULL) {1068// Bailout without retry1069assert(false, "graph should be schedulable");1070C->record_method_not_compilable("late schedule failed: LCA == NULL");1071return least;1072}10731074// Don't hoist machine instructions to the root basic block1075if (mach && LCA == root_block)1076break;10771078uint start_lat = get_latency_for_node(LCA->head());1079uint end_idx = LCA->end_idx();1080uint end_lat = get_latency_for_node(LCA->get_node(end_idx));1081double LCA_freq = LCA->_freq;1082#ifndef PRODUCT1083if (trace_opto_pipelining()) {1084tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",1085LCA->_pre_order, LCA->head()->_idx, start_lat, end_idx, end_lat, LCA_freq);1086}1087#endif1088cand_cnt++;1089if (LCA_freq < least_freq || // Better Frequency1090(StressGCM && Compile::randomized_select(cand_cnt)) || // Should be randomly accepted in stress mode1091(!StressGCM && // Otherwise, choose with latency1092!in_latency && // No block containing latency1093LCA_freq < least_freq * delta && // No worse frequency1094target >= end_lat && // within latency range1095!self->is_iteratively_computed() ) // But don't hoist IV increments1096// because they may end up above other uses of their phi forcing1097// their result register to be different from their input.1098) {1099least = LCA; // Found cheaper block1100least_freq = LCA_freq;1101start_latency = start_lat;1102end_latency = end_lat;1103if (target <= start_lat)1104in_latency = true;1105}1106}11071108#ifndef PRODUCT1109if (trace_opto_pipelining()) {1110tty->print_cr("# Choose block B%d with start latency=%d and freq=%g",1111least->_pre_order, start_latency, least_freq);1112}1113#endif11141115// See if the latency needs to be updated1116if (target < end_latency) {1117#ifndef PRODUCT1118if (trace_opto_pipelining()) {1119tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);1120}1121#endif1122set_latency_for_node(self, end_latency);1123partial_latency_of_defs(self);1124}11251126return least;1127}112811291130//------------------------------schedule_late-----------------------------------1131// Now schedule all codes as LATE as possible. This is the LCA in the1132// dominator tree of all USES of a value. Pick the block with the least1133// loop nesting depth that is lowest in the dominator tree.1134extern const char must_clone[];1135void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {1136#ifndef PRODUCT1137if (trace_opto_pipelining())1138tty->print("\n#---- schedule_late ----\n");1139#endif11401141Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);1142Node *self;11431144// Walk over all the nodes from last to first1145while (self = iter.next()) {1146Block* early = get_block_for_node(self); // Earliest legal placement11471148if (self->is_top()) {1149// Top node goes in bb #2 with other constants.1150// It must be special-cased, because it has no out edges.1151early->add_inst(self);1152continue;1153}11541155// No uses, just terminate1156if (self->outcnt() == 0) {1157assert(self->is_MachProj(), "sanity");1158continue; // Must be a dead machine projection1159}11601161// If node is pinned in the block, then no scheduling can be done.1162if( self->pinned() ) // Pinned in block?1163continue;11641165MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;1166if (mach) {1167switch (mach->ideal_Opcode()) {1168case Op_CreateEx:1169// Don't move exception creation1170early->add_inst(self);1171continue;1172break;1173case Op_CheckCastPP:1174// Don't move CheckCastPP nodes away from their input, if the input1175// is a rawptr (5071820).1176Node *def = self->in(1);1177if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {1178early->add_inst(self);1179#ifdef ASSERT1180_raw_oops.push(def);1181#endif1182continue;1183}1184break;1185}1186}11871188// Gather LCA of all uses1189Block *LCA = NULL;1190{1191for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {1192// For all uses, find LCA1193Node* use = self->fast_out(i);1194LCA = raise_LCA_above_use(LCA, use, self, this);1195}1196} // (Hide defs of imax, i from rest of block.)11971198// Place temps in the block of their use. This isn't a1199// requirement for correctness but it reduces useless1200// interference between temps and other nodes.1201if (mach != NULL && mach->is_MachTemp()) {1202map_node_to_block(self, LCA);1203LCA->add_inst(self);1204continue;1205}12061207// Check if 'self' could be anti-dependent on memory1208if (self->needs_anti_dependence_check()) {1209// Hoist LCA above possible-defs and insert anti-dependences to1210// defs in new LCA block.1211LCA = insert_anti_dependences(LCA, self);1212}12131214if (early->_dom_depth > LCA->_dom_depth) {1215// Somehow the LCA has moved above the earliest legal point.1216// (One way this can happen is via memory_early_block.)1217if (C->subsume_loads() == true && !C->failing()) {1218// Retry with subsume_loads == false1219// If this is the first failure, the sentinel string will "stick"1220// to the Compile object, and the C2Compiler will see it and retry.1221C->record_failure(C2Compiler::retry_no_subsuming_loads());1222} else {1223// Bailout without retry when (early->_dom_depth > LCA->_dom_depth)1224assert(false, "graph should be schedulable");1225C->record_method_not_compilable("late schedule failed: incorrect graph");1226}1227return;1228}12291230// If there is no opportunity to hoist, then we're done.1231// In stress mode, try to hoist even the single operations.1232bool try_to_hoist = StressGCM || (LCA != early);12331234// Must clone guys stay next to use; no hoisting allowed.1235// Also cannot hoist guys that alter memory or are otherwise not1236// allocatable (hoisting can make a value live longer, leading to1237// anti and output dependency problems which are normally resolved1238// by the register allocator giving everyone a different register).1239if (mach != NULL && must_clone[mach->ideal_Opcode()])1240try_to_hoist = false;12411242Block* late = NULL;1243if (try_to_hoist) {1244// Now find the block with the least execution frequency.1245// Start at the latest schedule and work up to the earliest schedule1246// in the dominator tree. Thus the Node will dominate all its uses.1247late = hoist_to_cheaper_block(LCA, early, self);1248} else {1249// Just use the LCA of the uses.1250late = LCA;1251}12521253// Put the node into target block1254schedule_node_into_block(self, late);12551256#ifdef ASSERT1257if (self->needs_anti_dependence_check()) {1258// since precedence edges are only inserted when we're sure they1259// are needed make sure that after placement in a block we don't1260// need any new precedence edges.1261verify_anti_dependences(late, self);1262}1263#endif1264} // Loop until all nodes have been visited12651266} // end ScheduleLate12671268//------------------------------GlobalCodeMotion-------------------------------1269void PhaseCFG::global_code_motion() {1270ResourceMark rm;12711272#ifndef PRODUCT1273if (trace_opto_pipelining()) {1274tty->print("\n---- Start GlobalCodeMotion ----\n");1275}1276#endif12771278// Initialize the node to block mapping for things on the proj_list1279for (uint i = 0; i < _matcher.number_of_projections(); i++) {1280unmap_node_from_block(_matcher.get_projection(i));1281}12821283// Set the basic block for Nodes pinned into blocks1284Arena* arena = Thread::current()->resource_area();1285VectorSet visited(arena);1286schedule_pinned_nodes(visited);12871288// Find the earliest Block any instruction can be placed in. Some1289// instructions are pinned into Blocks. Unpinned instructions can1290// appear in last block in which all their inputs occur.1291visited.Clear();1292Node_List stack(arena);1293// Pre-grow the list1294stack.map((C->live_nodes() >> 1) + 16, NULL);1295if (!schedule_early(visited, stack)) {1296// Bailout without retry1297C->record_method_not_compilable("early schedule failed");1298return;1299}13001301// Build Def-Use edges.1302// Compute the latency information (via backwards walk) for all the1303// instructions in the graph1304_node_latency = new GrowableArray<uint>(); // resource_area allocation13051306if (C->do_scheduling()) {1307compute_latencies_backwards(visited, stack);1308}13091310// Now schedule all codes as LATE as possible. This is the LCA in the1311// dominator tree of all USES of a value. Pick the block with the least1312// loop nesting depth that is lowest in the dominator tree.1313// ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )1314schedule_late(visited, stack);1315if (C->failing()) {1316return;1317}13181319#ifndef PRODUCT1320if (trace_opto_pipelining()) {1321tty->print("\n---- Detect implicit null checks ----\n");1322}1323#endif13241325// Detect implicit-null-check opportunities. Basically, find NULL checks1326// with suitable memory ops nearby. Use the memory op to do the NULL check.1327// I can generate a memory op if there is not one nearby.1328if (C->is_method_compilation()) {1329// By reversing the loop direction we get a very minor gain on mpegaudio.1330// Feel free to revert to a forward loop for clarity.1331// for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {1332for (int i = _matcher._null_check_tests.size() - 2; i >= 0; i -= 2) {1333Node* proj = _matcher._null_check_tests[i];1334Node* val = _matcher._null_check_tests[i + 1];1335Block* block = get_block_for_node(proj);1336implicit_null_check(block, proj, val, C->allowed_deopt_reasons());1337// The implicit_null_check will only perform the transformation1338// if the null branch is truly uncommon, *and* it leads to an1339// uncommon trap. Combined with the too_many_traps guards1340// above, this prevents SEGV storms reported in 6366351,1341// by recompiling offending methods without this optimization.1342}1343}13441345#ifndef PRODUCT1346if (trace_opto_pipelining()) {1347tty->print("\n---- Start Local Scheduling ----\n");1348}1349#endif13501351// Schedule locally. Right now a simple topological sort.1352// Later, do a real latency aware scheduler.1353GrowableArray<int> ready_cnt(C->unique(), C->unique(), -1);1354visited.Clear();1355for (uint i = 0; i < number_of_blocks(); i++) {1356Block* block = get_block(i);1357if (!schedule_local(block, ready_cnt, visited)) {1358if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {1359C->record_method_not_compilable("local schedule failed");1360}1361return;1362}1363}13641365// If we inserted any instructions between a Call and his CatchNode,1366// clone the instructions on all paths below the Catch.1367for (uint i = 0; i < number_of_blocks(); i++) {1368Block* block = get_block(i);1369call_catch_cleanup(block);1370}13711372#ifndef PRODUCT1373if (trace_opto_pipelining()) {1374tty->print("\n---- After GlobalCodeMotion ----\n");1375for (uint i = 0; i < number_of_blocks(); i++) {1376Block* block = get_block(i);1377block->dump();1378}1379}1380#endif1381// Dead.1382_node_latency = (GrowableArray<uint> *)((intptr_t)0xdeadbeef);1383}13841385bool PhaseCFG::do_global_code_motion() {13861387build_dominator_tree();1388if (C->failing()) {1389return false;1390}13911392NOT_PRODUCT( C->verify_graph_edges(); )13931394estimate_block_frequency();13951396global_code_motion();13971398if (C->failing()) {1399return false;1400}14011402return true;1403}14041405//------------------------------Estimate_Block_Frequency-----------------------1406// Estimate block frequencies based on IfNode probabilities.1407void PhaseCFG::estimate_block_frequency() {14081409// Force conditional branches leading to uncommon traps to be unlikely,1410// not because we get to the uncommon_trap with less relative frequency,1411// but because an uncommon_trap typically causes a deopt, so we only get1412// there once.1413if (C->do_freq_based_layout()) {1414Block_List worklist;1415Block* root_blk = get_block(0);1416for (uint i = 1; i < root_blk->num_preds(); i++) {1417Block *pb = get_block_for_node(root_blk->pred(i));1418if (pb->has_uncommon_code()) {1419worklist.push(pb);1420}1421}1422while (worklist.size() > 0) {1423Block* uct = worklist.pop();1424if (uct == get_root_block()) {1425continue;1426}1427for (uint i = 1; i < uct->num_preds(); i++) {1428Block *pb = get_block_for_node(uct->pred(i));1429if (pb->_num_succs == 1) {1430worklist.push(pb);1431} else if (pb->num_fall_throughs() == 2) {1432pb->update_uncommon_branch(uct);1433}1434}1435}1436}14371438// Create the loop tree and calculate loop depth.1439_root_loop = create_loop_tree();1440_root_loop->compute_loop_depth(0);14411442// Compute block frequency of each block, relative to a single loop entry.1443_root_loop->compute_freq();14441445// Adjust all frequencies to be relative to a single method entry1446_root_loop->_freq = 1.0;1447_root_loop->scale_freq();14481449// Save outmost loop frequency for LRG frequency threshold1450_outer_loop_frequency = _root_loop->outer_loop_freq();14511452// force paths ending at uncommon traps to be infrequent1453if (!C->do_freq_based_layout()) {1454Block_List worklist;1455Block* root_blk = get_block(0);1456for (uint i = 1; i < root_blk->num_preds(); i++) {1457Block *pb = get_block_for_node(root_blk->pred(i));1458if (pb->has_uncommon_code()) {1459worklist.push(pb);1460}1461}1462while (worklist.size() > 0) {1463Block* uct = worklist.pop();1464uct->_freq = PROB_MIN;1465for (uint i = 1; i < uct->num_preds(); i++) {1466Block *pb = get_block_for_node(uct->pred(i));1467if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {1468worklist.push(pb);1469}1470}1471}1472}14731474#ifdef ASSERT1475for (uint i = 0; i < number_of_blocks(); i++) {1476Block* b = get_block(i);1477assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");1478}1479#endif14801481#ifndef PRODUCT1482if (PrintCFGBlockFreq) {1483tty->print_cr("CFG Block Frequencies");1484_root_loop->dump_tree();1485if (Verbose) {1486tty->print_cr("PhaseCFG dump");1487dump();1488tty->print_cr("Node dump");1489_root->dump(99999);1490}1491}1492#endif1493}14941495//----------------------------create_loop_tree--------------------------------1496// Create a loop tree from the CFG1497CFGLoop* PhaseCFG::create_loop_tree() {14981499#ifdef ASSERT1500assert(get_block(0) == get_root_block(), "first block should be root block");1501for (uint i = 0; i < number_of_blocks(); i++) {1502Block* block = get_block(i);1503// Check that _loop field are clear...we could clear them if not.1504assert(block->_loop == NULL, "clear _loop expected");1505// Sanity check that the RPO numbering is reflected in the _blocks array.1506// It doesn't have to be for the loop tree to be built, but if it is not,1507// then the blocks have been reordered since dom graph building...which1508// may question the RPO numbering1509assert(block->_rpo == i, "unexpected reverse post order number");1510}1511#endif15121513int idct = 0;1514CFGLoop* root_loop = new CFGLoop(idct++);15151516Block_List worklist;15171518// Assign blocks to loops1519for(uint i = number_of_blocks() - 1; i > 0; i-- ) { // skip Root block1520Block* block = get_block(i);15211522if (block->head()->is_Loop()) {1523Block* loop_head = block;1524assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");1525Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);1526Block* tail = get_block_for_node(tail_n);15271528// Defensively filter out Loop nodes for non-single-entry loops.1529// For all reasonable loops, the head occurs before the tail in RPO.1530if (i <= tail->_rpo) {15311532// The tail and (recursive) predecessors of the tail1533// are made members of a new loop.15341535assert(worklist.size() == 0, "nonempty worklist");1536CFGLoop* nloop = new CFGLoop(idct++);1537assert(loop_head->_loop == NULL, "just checking");1538loop_head->_loop = nloop;1539// Add to nloop so push_pred() will skip over inner loops1540nloop->add_member(loop_head);1541nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this);15421543while (worklist.size() > 0) {1544Block* member = worklist.pop();1545if (member != loop_head) {1546for (uint j = 1; j < member->num_preds(); j++) {1547nloop->push_pred(member, j, worklist, this);1548}1549}1550}1551}1552}1553}15541555// Create a member list for each loop consisting1556// of both blocks and (immediate child) loops.1557for (uint i = 0; i < number_of_blocks(); i++) {1558Block* block = get_block(i);1559CFGLoop* lp = block->_loop;1560if (lp == NULL) {1561// Not assigned to a loop. Add it to the method's pseudo loop.1562block->_loop = root_loop;1563lp = root_loop;1564}1565if (lp == root_loop || block != lp->head()) { // loop heads are already members1566lp->add_member(block);1567}1568if (lp != root_loop) {1569if (lp->parent() == NULL) {1570// Not a nested loop. Make it a child of the method's pseudo loop.1571root_loop->add_nested_loop(lp);1572}1573if (block == lp->head()) {1574// Add nested loop to member list of parent loop.1575lp->parent()->add_member(lp);1576}1577}1578}15791580return root_loop;1581}15821583//------------------------------push_pred--------------------------------------1584void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) {1585Node* pred_n = blk->pred(i);1586Block* pred = cfg->get_block_for_node(pred_n);1587CFGLoop *pred_loop = pred->_loop;1588if (pred_loop == NULL) {1589// Filter out blocks for non-single-entry loops.1590// For all reasonable loops, the head occurs before the tail in RPO.1591if (pred->_rpo > head()->_rpo) {1592pred->_loop = this;1593worklist.push(pred);1594}1595} else if (pred_loop != this) {1596// Nested loop.1597while (pred_loop->_parent != NULL && pred_loop->_parent != this) {1598pred_loop = pred_loop->_parent;1599}1600// Make pred's loop be a child1601if (pred_loop->_parent == NULL) {1602add_nested_loop(pred_loop);1603// Continue with loop entry predecessor.1604Block* pred_head = pred_loop->head();1605assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");1606assert(pred_head != head(), "loop head in only one loop");1607push_pred(pred_head, LoopNode::EntryControl, worklist, cfg);1608} else {1609assert(pred_loop->_parent == this && _parent == NULL, "just checking");1610}1611}1612}16131614//------------------------------add_nested_loop--------------------------------1615// Make cl a child of the current loop in the loop tree.1616void CFGLoop::add_nested_loop(CFGLoop* cl) {1617assert(_parent == NULL, "no parent yet");1618assert(cl != this, "not my own parent");1619cl->_parent = this;1620CFGLoop* ch = _child;1621if (ch == NULL) {1622_child = cl;1623} else {1624while (ch->_sibling != NULL) { ch = ch->_sibling; }1625ch->_sibling = cl;1626}1627}16281629//------------------------------compute_loop_depth-----------------------------1630// Store the loop depth in each CFGLoop object.1631// Recursively walk the children to do the same for them.1632void CFGLoop::compute_loop_depth(int depth) {1633_depth = depth;1634CFGLoop* ch = _child;1635while (ch != NULL) {1636ch->compute_loop_depth(depth + 1);1637ch = ch->_sibling;1638}1639}16401641//------------------------------compute_freq-----------------------------------1642// Compute the frequency of each block and loop, relative to a single entry1643// into the dominating loop head.1644void CFGLoop::compute_freq() {1645// Bottom up traversal of loop tree (visit inner loops first.)1646// Set loop head frequency to 1.0, then transitively1647// compute frequency for all successors in the loop,1648// as well as for each exit edge. Inner loops are1649// treated as single blocks with loop exit targets1650// as the successor blocks.16511652// Nested loops first1653CFGLoop* ch = _child;1654while (ch != NULL) {1655ch->compute_freq();1656ch = ch->_sibling;1657}1658assert (_members.length() > 0, "no empty loops");1659Block* hd = head();1660hd->_freq = 1.0f;1661for (int i = 0; i < _members.length(); i++) {1662CFGElement* s = _members.at(i);1663float freq = s->_freq;1664if (s->is_block()) {1665Block* b = s->as_Block();1666for (uint j = 0; j < b->_num_succs; j++) {1667Block* sb = b->_succs[j];1668update_succ_freq(sb, freq * b->succ_prob(j));1669}1670} else {1671CFGLoop* lp = s->as_CFGLoop();1672assert(lp->_parent == this, "immediate child");1673for (int k = 0; k < lp->_exits.length(); k++) {1674Block* eb = lp->_exits.at(k).get_target();1675float prob = lp->_exits.at(k).get_prob();1676update_succ_freq(eb, freq * prob);1677}1678}1679}16801681// For all loops other than the outer, "method" loop,1682// sum and normalize the exit probability. The "method" loop1683// should keep the initial exit probability of 1, so that1684// inner blocks do not get erroneously scaled.1685if (_depth != 0) {1686// Total the exit probabilities for this loop.1687float exits_sum = 0.0f;1688for (int i = 0; i < _exits.length(); i++) {1689exits_sum += _exits.at(i).get_prob();1690}16911692// Normalize the exit probabilities. Until now, the1693// probabilities estimate the possibility of exit per1694// a single loop iteration; afterward, they estimate1695// the probability of exit per loop entry.1696for (int i = 0; i < _exits.length(); i++) {1697Block* et = _exits.at(i).get_target();1698float new_prob = 0.0f;1699if (_exits.at(i).get_prob() > 0.0f) {1700new_prob = _exits.at(i).get_prob() / exits_sum;1701}1702BlockProbPair bpp(et, new_prob);1703_exits.at_put(i, bpp);1704}17051706// Save the total, but guard against unreasonable probability,1707// as the value is used to estimate the loop trip count.1708// An infinite trip count would blur relative block1709// frequencies.1710if (exits_sum > 1.0f) exits_sum = 1.0;1711if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;1712_exit_prob = exits_sum;1713}1714}17151716//------------------------------succ_prob-------------------------------------1717// Determine the probability of reaching successor 'i' from the receiver block.1718float Block::succ_prob(uint i) {1719int eidx = end_idx();1720Node *n = get_node(eidx); // Get ending Node17211722int op = n->Opcode();1723if (n->is_Mach()) {1724if (n->is_MachNullCheck()) {1725// Can only reach here if called after lcm. The original Op_If is gone,1726// so we attempt to infer the probability from one or both of the1727// successor blocks.1728assert(_num_succs == 2, "expecting 2 successors of a null check");1729// If either successor has only one predecessor, then the1730// probability estimate can be derived using the1731// relative frequency of the successor and this block.1732if (_succs[i]->num_preds() == 2) {1733return _succs[i]->_freq / _freq;1734} else if (_succs[1-i]->num_preds() == 2) {1735return 1 - (_succs[1-i]->_freq / _freq);1736} else {1737// Estimate using both successor frequencies1738float freq = _succs[i]->_freq;1739return freq / (freq + _succs[1-i]->_freq);1740}1741}1742op = n->as_Mach()->ideal_Opcode();1743}174417451746// Switch on branch type1747switch( op ) {1748case Op_CountedLoopEnd:1749case Op_If: {1750assert (i < 2, "just checking");1751// Conditionals pass on only part of their frequency1752float prob = n->as_MachIf()->_prob;1753assert(prob >= 0.0 && prob <= 1.0, "out of range probability");1754// If succ[i] is the FALSE branch, invert path info1755if( get_node(i + eidx + 1)->Opcode() == Op_IfFalse ) {1756return 1.0f - prob; // not taken1757} else {1758return prob; // taken1759}1760}17611762case Op_Jump:1763// Divide the frequency between all successors evenly1764return 1.0f/_num_succs;17651766case Op_Catch: {1767const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();1768if (ci->_con == CatchProjNode::fall_through_index) {1769// Fall-thru path gets the lion's share.1770return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;1771} else {1772// Presume exceptional paths are equally unlikely1773return PROB_UNLIKELY_MAG(5);1774}1775}17761777case Op_Root:1778case Op_Goto:1779// Pass frequency straight thru to target1780return 1.0f;17811782case Op_NeverBranch:1783return 0.0f;17841785case Op_TailCall:1786case Op_TailJump:1787case Op_Return:1788case Op_Halt:1789case Op_Rethrow:1790// Do not push out freq to root block1791return 0.0f;17921793default:1794ShouldNotReachHere();1795}17961797return 0.0f;1798}17991800//------------------------------num_fall_throughs-----------------------------1801// Return the number of fall-through candidates for a block1802int Block::num_fall_throughs() {1803int eidx = end_idx();1804Node *n = get_node(eidx); // Get ending Node18051806int op = n->Opcode();1807if (n->is_Mach()) {1808if (n->is_MachNullCheck()) {1809// In theory, either side can fall-thru, for simplicity sake,1810// let's say only the false branch can now.1811return 1;1812}1813op = n->as_Mach()->ideal_Opcode();1814}18151816// Switch on branch type1817switch( op ) {1818case Op_CountedLoopEnd:1819case Op_If:1820return 2;18211822case Op_Root:1823case Op_Goto:1824return 1;18251826case Op_Catch: {1827for (uint i = 0; i < _num_succs; i++) {1828const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();1829if (ci->_con == CatchProjNode::fall_through_index) {1830return 1;1831}1832}1833return 0;1834}18351836case Op_Jump:1837case Op_NeverBranch:1838case Op_TailCall:1839case Op_TailJump:1840case Op_Return:1841case Op_Halt:1842case Op_Rethrow:1843return 0;18441845default:1846ShouldNotReachHere();1847}18481849return 0;1850}18511852//------------------------------succ_fall_through-----------------------------1853// Return true if a specific successor could be fall-through target.1854bool Block::succ_fall_through(uint i) {1855int eidx = end_idx();1856Node *n = get_node(eidx); // Get ending Node18571858int op = n->Opcode();1859if (n->is_Mach()) {1860if (n->is_MachNullCheck()) {1861// In theory, either side can fall-thru, for simplicity sake,1862// let's say only the false branch can now.1863return get_node(i + eidx + 1)->Opcode() == Op_IfFalse;1864}1865op = n->as_Mach()->ideal_Opcode();1866}18671868// Switch on branch type1869switch( op ) {1870case Op_CountedLoopEnd:1871case Op_If:1872case Op_Root:1873case Op_Goto:1874return true;18751876case Op_Catch: {1877const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();1878return ci->_con == CatchProjNode::fall_through_index;1879}18801881case Op_Jump:1882case Op_NeverBranch:1883case Op_TailCall:1884case Op_TailJump:1885case Op_Return:1886case Op_Halt:1887case Op_Rethrow:1888return false;18891890default:1891ShouldNotReachHere();1892}18931894return false;1895}18961897//------------------------------update_uncommon_branch------------------------1898// Update the probability of a two-branch to be uncommon1899void Block::update_uncommon_branch(Block* ub) {1900int eidx = end_idx();1901Node *n = get_node(eidx); // Get ending Node19021903int op = n->as_Mach()->ideal_Opcode();19041905assert(op == Op_CountedLoopEnd || op == Op_If, "must be a If");1906assert(num_fall_throughs() == 2, "must be a two way branch block");19071908// Which successor is ub?1909uint s;1910for (s = 0; s <_num_succs; s++) {1911if (_succs[s] == ub) break;1912}1913assert(s < 2, "uncommon successor must be found");19141915// If ub is the true path, make the proability small, else1916// ub is the false path, and make the probability large1917bool invert = (get_node(s + eidx + 1)->Opcode() == Op_IfFalse);19181919// Get existing probability1920float p = n->as_MachIf()->_prob;19211922if (invert) p = 1.0 - p;1923if (p > PROB_MIN) {1924p = PROB_MIN;1925}1926if (invert) p = 1.0 - p;19271928n->as_MachIf()->_prob = p;1929}19301931//------------------------------update_succ_freq-------------------------------1932// Update the appropriate frequency associated with block 'b', a successor of1933// a block in this loop.1934void CFGLoop::update_succ_freq(Block* b, float freq) {1935if (b->_loop == this) {1936if (b == head()) {1937// back branch within the loop1938// Do nothing now, the loop carried frequency will be1939// adjust later in scale_freq().1940} else {1941// simple branch within the loop1942b->_freq += freq;1943}1944} else if (!in_loop_nest(b)) {1945// branch is exit from this loop1946BlockProbPair bpp(b, freq);1947_exits.append(bpp);1948} else {1949// branch into nested loop1950CFGLoop* ch = b->_loop;1951ch->_freq += freq;1952}1953}19541955//------------------------------in_loop_nest-----------------------------------1956// Determine if block b is in the receiver's loop nest.1957bool CFGLoop::in_loop_nest(Block* b) {1958int depth = _depth;1959CFGLoop* b_loop = b->_loop;1960int b_depth = b_loop->_depth;1961if (depth == b_depth) {1962return true;1963}1964while (b_depth > depth) {1965b_loop = b_loop->_parent;1966b_depth = b_loop->_depth;1967}1968return b_loop == this;1969}19701971//------------------------------scale_freq-------------------------------------1972// Scale frequency of loops and blocks by trip counts from outer loops1973// Do a top down traversal of loop tree (visit outer loops first.)1974void CFGLoop::scale_freq() {1975float loop_freq = _freq * trip_count();1976_freq = loop_freq;1977for (int i = 0; i < _members.length(); i++) {1978CFGElement* s = _members.at(i);1979float block_freq = s->_freq * loop_freq;1980if (g_isnan(block_freq) || block_freq < MIN_BLOCK_FREQUENCY)1981block_freq = MIN_BLOCK_FREQUENCY;1982s->_freq = block_freq;1983}1984CFGLoop* ch = _child;1985while (ch != NULL) {1986ch->scale_freq();1987ch = ch->_sibling;1988}1989}19901991// Frequency of outer loop1992float CFGLoop::outer_loop_freq() const {1993if (_child != NULL) {1994return _child->_freq;1995}1996return _freq;1997}19981999#ifndef PRODUCT2000//------------------------------dump_tree--------------------------------------2001void CFGLoop::dump_tree() const {2002dump();2003if (_child != NULL) _child->dump_tree();2004if (_sibling != NULL) _sibling->dump_tree();2005}20062007//------------------------------dump-------------------------------------------2008void CFGLoop::dump() const {2009for (int i = 0; i < _depth; i++) tty->print(" ");2010tty->print("%s: %d trip_count: %6.0f freq: %6.0f\n",2011_depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);2012for (int i = 0; i < _depth; i++) tty->print(" ");2013tty->print(" members:");2014int k = 0;2015for (int i = 0; i < _members.length(); i++) {2016if (k++ >= 6) {2017tty->print("\n ");2018for (int j = 0; j < _depth+1; j++) tty->print(" ");2019k = 0;2020}2021CFGElement *s = _members.at(i);2022if (s->is_block()) {2023Block *b = s->as_Block();2024tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);2025} else {2026CFGLoop* lp = s->as_CFGLoop();2027tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);2028}2029}2030tty->print("\n");2031for (int i = 0; i < _depth; i++) tty->print(" ");2032tty->print(" exits: ");2033k = 0;2034for (int i = 0; i < _exits.length(); i++) {2035if (k++ >= 7) {2036tty->print("\n ");2037for (int j = 0; j < _depth+1; j++) tty->print(" ");2038k = 0;2039}2040Block *blk = _exits.at(i).get_target();2041float prob = _exits.at(i).get_prob();2042tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));2043}2044tty->print("\n");2045}2046#endif204720482049