Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
bytecodealliance
GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/cranelift/codegen/src/egraph.rs
1693 views
1
//! Support for egraphs represented in the DataFlowGraph.
2
3
use crate::alias_analysis::{AliasAnalysis, LastStores};
4
use crate::ctxhash::{CtxEq, CtxHash, NullCtx};
5
use crate::cursor::{Cursor, CursorPosition, FuncCursor};
6
use crate::dominator_tree::DominatorTree;
7
use crate::egraph::elaborate::Elaborator;
8
use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
9
use crate::ir::pcc::Fact;
10
use crate::ir::{
11
Block, DataFlowGraph, Function, Inst, InstructionData, Opcode, Type, Value, ValueDef,
12
ValueListPool,
13
};
14
use crate::loop_analysis::LoopAnalysis;
15
use crate::opts::IsleContext;
16
use crate::opts::generated_code::SkeletonInstSimplification;
17
use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
18
use crate::settings::Flags;
19
use crate::take_and_replace::TakeAndReplace;
20
use crate::trace;
21
use alloc::vec::Vec;
22
use core::cmp::Ordering;
23
use core::hash::Hasher;
24
use cranelift_control::ControlPlane;
25
use cranelift_entity::SecondaryMap;
26
use cranelift_entity::packed_option::ReservedValue;
27
use rustc_hash::FxHashSet;
28
use smallvec::SmallVec;
29
30
mod cost;
31
mod elaborate;
32
33
/// Pass over a Function that does the whole aegraph thing.
34
///
35
/// - Removes non-skeleton nodes from the Layout.
36
/// - Performs a GVN-and-rule-application pass over all Values
37
/// reachable from the skeleton, potentially creating new Union
38
/// nodes (i.e., an aegraph) so that some values have multiple
39
/// representations.
40
/// - Does "extraction" on the aegraph: selects the best value out of
41
/// the tree-of-Union nodes for each used value.
42
/// - Does "scoped elaboration" on the aegraph: chooses one or more
43
/// locations for pure nodes to become instructions again in the
44
/// layout, as forced by the skeleton.
45
///
46
/// At the beginning and end of this pass, the CLIF should be in a
47
/// state that passes the verifier and, additionally, has no Union
48
/// nodes. During the pass, Union nodes may exist, and instructions in
49
/// the layout may refer to results of instructions that are not
50
/// placed in the layout.
51
pub struct EgraphPass<'a> {
52
/// The function we're operating on.
53
func: &'a mut Function,
54
/// Dominator tree for the CFG, used to visit blocks in pre-order
55
/// so we see value definitions before their uses, and also used for
56
/// O(1) dominance checks.
57
domtree: &'a DominatorTree,
58
/// Alias analysis, used during optimization.
59
alias_analysis: &'a mut AliasAnalysis<'a>,
60
/// Loop analysis results, used for built-in LICM during
61
/// elaboration.
62
loop_analysis: &'a LoopAnalysis,
63
/// Compiler flags.
64
flags: &'a Flags,
65
/// Chaos-mode control-plane so we can test that we still get
66
/// correct results when our heuristics make bad decisions.
67
ctrl_plane: &'a mut ControlPlane,
68
/// Which Values do we want to rematerialize in each block where
69
/// they're used?
70
remat_values: FxHashSet<Value>,
71
/// Stats collected while we run this pass.
72
pub(crate) stats: Stats,
73
}
74
75
/// The maximum number of rewrites we will take from a single call into ISLE.
76
const MATCHES_LIMIT: usize = 5;
77
78
/// The maximum number of enodes in any given eclass.
79
const ECLASS_ENODE_LIMIT: usize = 5;
80
81
/// Context passed through node insertion and optimization.
82
pub(crate) struct OptimizeCtx<'opt, 'analysis>
83
where
84
'analysis: 'opt,
85
{
86
// Borrowed from EgraphPass:
87
pub(crate) func: &'opt mut Function,
88
pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
89
available_block: &'opt mut SecondaryMap<Value, Block>,
90
eclass_size: &'opt mut SecondaryMap<Value, u8>,
91
pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>,
92
pub(crate) gvn_map_blocks: &'opt Vec<Block>,
93
pub(crate) remat_values: &'opt mut FxHashSet<Value>,
94
pub(crate) stats: &'opt mut Stats,
95
domtree: &'opt DominatorTree,
96
pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
97
pub(crate) alias_analysis_state: &'opt mut LastStores,
98
flags: &'opt Flags,
99
ctrl_plane: &'opt mut ControlPlane,
100
// Held locally during optimization of one node (recursively):
101
pub(crate) rewrite_depth: usize,
102
pub(crate) subsume_values: FxHashSet<Value>,
103
optimized_values: SmallVec<[Value; MATCHES_LIMIT]>,
104
optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>,
105
}
106
107
/// For passing to `insert_pure_enode`. Sometimes the enode already
108
/// exists as an Inst (from the original CLIF), and sometimes we're in
109
/// the middle of creating it and want to avoid inserting it if
110
/// possible until we know we need it.
111
pub(crate) enum NewOrExistingInst {
112
New(InstructionData, Type),
113
Existing(Inst),
114
}
115
116
impl NewOrExistingInst {
117
fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
118
match self {
119
NewOrExistingInst::New(data, ty) => (*ty, *data),
120
NewOrExistingInst::Existing(inst) => {
121
let ty = dfg.ctrl_typevar(*inst);
122
(ty, dfg.insts[*inst])
123
}
124
}
125
}
126
}
127
128
impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
129
where
130
'analysis: 'opt,
131
{
132
/// Optimization of a single instruction.
133
///
134
/// This does a few things:
135
/// - Looks up the instruction in the GVN deduplication map. If we
136
/// already have the same instruction somewhere else, with the
137
/// same args, then we can alias the original instruction's
138
/// results and omit this instruction entirely.
139
/// - If the instruction is "new" (not deduplicated), then apply
140
/// optimization rules:
141
/// - All of the mid-end rules written in ISLE.
142
/// - Store-to-load forwarding.
143
/// - Update the value-to-opt-value map, and update the eclass
144
/// union-find, if we rewrote the value to different form(s).
145
pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
146
// Create the external context for looking up and updating the
147
// GVN map. This is necessary so that instructions themselves
148
// do not have to carry all the references or data for a full
149
// `Eq` or `Hash` impl.
150
let gvn_context = GVNContext {
151
value_lists: &self.func.dfg.value_lists,
152
};
153
154
self.stats.pure_inst += 1;
155
if let NewOrExistingInst::New(..) = inst {
156
self.stats.new_inst += 1;
157
}
158
159
// Does this instruction already exist? If so, add entries to
160
// the value-map to rewrite uses of its results to the results
161
// of the original (existing) instruction. If not, optimize
162
// the new instruction.
163
if let Some(&Some(orig_result)) = self
164
.gvn_map
165
.get(&gvn_context, &inst.get_inst_key(&self.func.dfg))
166
{
167
self.stats.pure_inst_deduped += 1;
168
if let NewOrExistingInst::Existing(inst) = inst {
169
debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
170
let result = self.func.dfg.first_result(inst);
171
self.value_to_opt_value[result] = orig_result;
172
self.available_block[result] = self.available_block[orig_result];
173
self.func.dfg.merge_facts(result, orig_result);
174
}
175
orig_result
176
} else {
177
// Now actually insert the InstructionData and attach
178
// result value (exactly one).
179
let (inst, result, ty) = match inst {
180
NewOrExistingInst::New(data, typevar) => {
181
self.stats.pure_inst_insert_new += 1;
182
let inst = self.func.dfg.make_inst(data);
183
// TODO: reuse return value?
184
self.func.dfg.make_inst_results(inst, typevar);
185
let result = self.func.dfg.first_result(inst);
186
// New inst. We need to do the analysis of its result.
187
(inst, result, typevar)
188
}
189
NewOrExistingInst::Existing(inst) => {
190
self.stats.pure_inst_insert_orig += 1;
191
let result = self.func.dfg.first_result(inst);
192
let ty = self.func.dfg.ctrl_typevar(inst);
193
(inst, result, ty)
194
}
195
};
196
197
self.attach_constant_fact(inst, result, ty);
198
199
self.available_block[result] = self.get_available_block(inst);
200
let opt_value = self.optimize_pure_enode(inst);
201
log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}");
202
203
let gvn_context = GVNContext {
204
value_lists: &self.func.dfg.value_lists,
205
};
206
// Insert at level implied by args. This enables merging
207
// in LICM cases like:
208
//
209
// while (...) {
210
// if (...) {
211
// let x = loop_invariant_expr;
212
// }
213
// if (...) {
214
// let x = loop_invariant_expr;
215
// }
216
// }
217
//
218
// where the two instances of the expression otherwise
219
// wouldn't merge because each would be in a separate
220
// subscope of the scoped hashmap during traversal.
221
log::trace!(
222
"value {} is available at {}",
223
opt_value,
224
self.available_block[opt_value]
225
);
226
let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]);
227
self.gvn_map.insert_with_depth(
228
&gvn_context,
229
(ty, self.func.dfg.insts[inst]),
230
Some(opt_value),
231
depth,
232
);
233
self.value_to_opt_value[result] = opt_value;
234
opt_value
235
}
236
}
237
238
/// Find the block where a pure instruction first becomes available,
239
/// defined as the block that is closest to the root where all of
240
/// its arguments are available. In the unusual case where a pure
241
/// instruction has no arguments (e.g. get_return_address), we can
242
/// place it anywhere, so it is available in the entry block.
243
///
244
/// This function does not compute available blocks recursively.
245
/// All of the instruction's arguments must have had their available
246
/// blocks assigned already.
247
fn get_available_block(&self, inst: Inst) -> Block {
248
// Side-effecting instructions have different rules for where
249
// they become available, so this function does not apply.
250
debug_assert!(is_pure_for_egraph(self.func, inst));
251
252
// Note that the def-point of all arguments to an instruction
253
// in SSA lie on a line of direct ancestors in the domtree, and
254
// so do their available-blocks. This means that for any pair of
255
// arguments, their available blocks are either the same or one
256
// strictly dominates the other. We just need to find any argument
257
// whose available block is deepest in the domtree.
258
self.func.dfg.insts[inst]
259
.arguments(&self.func.dfg.value_lists)
260
.iter()
261
.map(|&v| {
262
let block = self.available_block[v];
263
debug_assert!(!block.is_reserved_value());
264
block
265
})
266
.max_by(|&x, &y| {
267
if self.domtree.block_dominates(x, y) {
268
Ordering::Less
269
} else {
270
debug_assert!(self.domtree.block_dominates(y, x));
271
Ordering::Greater
272
}
273
})
274
.unwrap_or(self.func.layout.entry_block().unwrap())
275
}
276
277
fn depth_of_block_in_gvn_map(&self, block: Block) -> usize {
278
log::trace!(
279
"finding depth of available block {} in domtree stack: {:?}",
280
block,
281
self.gvn_map_blocks
282
);
283
self.gvn_map_blocks
284
.iter()
285
.enumerate()
286
.rev()
287
.find(|&(_, b)| *b == block)
288
.unwrap()
289
.0
290
}
291
292
/// Optimizes an enode by applying any matching mid-end rewrite
293
/// rules (or store-to-load forwarding, which is a special case),
294
/// unioning together all possible optimized (or rewritten) forms
295
/// of this expression into an eclass and returning the `Value`
296
/// that represents that eclass.
297
fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
298
// A pure node always has exactly one result.
299
let orig_value = self.func.dfg.first_result(inst);
300
301
let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values);
302
let (ctx, optimized_values) = guard.get();
303
304
// Limit rewrite depth. When we apply optimization rules, they
305
// may create new nodes (values) and those are, recursively,
306
// optimized eagerly as soon as they are created. So we may
307
// have more than one ISLE invocation on the stack. (This is
308
// necessary so that as the toplevel builds the
309
// right-hand-side expression bottom-up, it uses the "latest"
310
// optimized values for all the constituent parts.) To avoid
311
// infinite or problematic recursion, we bound the rewrite
312
// depth to a small constant here.
313
const REWRITE_LIMIT: usize = 5;
314
if ctx.rewrite_depth > REWRITE_LIMIT {
315
ctx.stats.rewrite_depth_limit += 1;
316
return orig_value;
317
}
318
ctx.rewrite_depth += 1;
319
trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth);
320
321
// Invoke the ISLE toplevel constructor, getting all new
322
// values produced as equivalents to this value.
323
trace!("Calling into ISLE with original value {}", orig_value);
324
ctx.stats.rewrite_rule_invoked += 1;
325
debug_assert!(optimized_values.is_empty());
326
crate::opts::generated_code::constructor_simplify(
327
&mut IsleContext { ctx },
328
orig_value,
329
optimized_values,
330
);
331
332
ctx.stats.rewrite_rule_results += optimized_values.len() as u64;
333
334
// It's not supposed to matter what order `simplify` returns values in.
335
ctx.ctrl_plane.shuffle(optimized_values);
336
337
let num_matches = optimized_values.len();
338
if num_matches > MATCHES_LIMIT {
339
trace!(
340
"Reached maximum matches limit; too many optimized values \
341
({num_matches} > {MATCHES_LIMIT}); ignoring rest.",
342
);
343
optimized_values.truncate(MATCHES_LIMIT);
344
}
345
346
// Sort and deduplicate optimized values, in case multiple
347
// rules produced the same simplification.
348
optimized_values.sort_unstable();
349
optimized_values.dedup();
350
351
trace!(" -> returned from ISLE: {orig_value} -> {optimized_values:?}");
352
353
// Construct a union-node tree representing the new eclass
354
// that results from rewriting. If any returned value was
355
// marked "subsume", take only that value. Otherwise,
356
// sequentially build the chain over the original value and
357
// all returned values.
358
let result_value = if let Some(&subsuming_value) = optimized_values
359
.iter()
360
.find(|&value| ctx.subsume_values.contains(value))
361
{
362
optimized_values.clear();
363
ctx.stats.pure_inst_subsume += 1;
364
subsuming_value
365
} else {
366
let mut union_value = orig_value;
367
let mut eclass_size = ctx.eclass_size[orig_value] + 1;
368
for optimized_value in optimized_values.drain(..) {
369
trace!(
370
"Returned from ISLE for {}, got {:?}",
371
orig_value, optimized_value
372
);
373
if optimized_value == orig_value {
374
trace!(" -> same as orig value; skipping");
375
ctx.stats.pure_inst_rewrite_to_self += 1;
376
continue;
377
}
378
let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1;
379
if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT {
380
trace!(" -> reached eclass size limit");
381
ctx.stats.eclass_size_limit += 1;
382
break;
383
}
384
let old_union_value = union_value;
385
union_value = ctx.func.dfg.union(old_union_value, optimized_value);
386
eclass_size += rhs_eclass_size;
387
ctx.eclass_size[union_value] = eclass_size - 1;
388
ctx.stats.union += 1;
389
trace!(" -> union: now {}", union_value);
390
ctx.func.dfg.merge_facts(old_union_value, optimized_value);
391
ctx.available_block[union_value] =
392
ctx.merge_availability(old_union_value, optimized_value);
393
}
394
union_value
395
};
396
397
ctx.rewrite_depth -= 1;
398
trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth);
399
if ctx.rewrite_depth == 0 {
400
ctx.subsume_values.clear();
401
}
402
403
debug_assert!(ctx.optimized_values.is_empty());
404
result_value
405
}
406
407
fn merge_availability(&self, a: Value, b: Value) -> Block {
408
let a = self.available_block[a];
409
let b = self.available_block[b];
410
if self.domtree.block_dominates(a, b) {
411
a
412
} else {
413
b
414
}
415
}
416
417
/// Optimize a "skeleton" instruction.
418
///
419
/// Returns an optional command of how to continue processing the optimized
420
/// instruction (e.g. removing it or replacing it with a new instruction).
421
fn optimize_skeleton_inst(
422
&mut self,
423
inst: Inst,
424
block: Block,
425
) -> Option<SkeletonInstSimplification> {
426
self.stats.skeleton_inst += 1;
427
428
// If we have a rewrite rule for this instruction, do that first, so
429
// that GVN and alias analysis only see simplified skeleton
430
// instructions.
431
if let Some(cmd) = self.simplify_skeleton_inst(inst) {
432
self.stats.skeleton_inst_simplified += 1;
433
return Some(cmd);
434
}
435
436
// First, can we try to deduplicate? We need to keep some copy
437
// of the instruction around because it's side-effecting, but
438
// we may be able to reuse an earlier instance of it.
439
if is_mergeable_for_egraph(self.func, inst) {
440
let result = self.func.dfg.inst_results(inst).get(0).copied();
441
trace!(" -> mergeable side-effecting op {}", inst);
442
443
// Does this instruction already exist? If so, add entries to
444
// the value-map to rewrite uses of its results to the results
445
// of the original (existing) instruction. If not, optimize
446
// the new instruction.
447
//
448
// Note that the GVN map is scoped, which is important
449
// here: because effectful ops are not removed from the
450
// skeleton (`Layout`), we need to be mindful of whether
451
// our current position is dominated by an instance of the
452
// instruction. (See #5796 for details.)
453
let ty = self.func.dfg.ctrl_typevar(inst);
454
match self
455
.gvn_map
456
.entry(&NullCtx, (ty, self.func.dfg.insts[inst]))
457
{
458
ScopedEntry::Occupied(o) => {
459
let orig_result = *o.get();
460
match (result, orig_result) {
461
(Some(result), Some(orig_result)) => {
462
// Hit in GVN map -- reuse value.
463
self.stats.skeleton_inst_gvn += 1;
464
self.value_to_opt_value[result] = orig_result;
465
self.available_block[result] = self.available_block[orig_result];
466
trace!(" -> merges result {} to {}", result, orig_result);
467
}
468
(None, None) => {
469
// Hit in the GVN map, but the instruction doesn't
470
// produce results, only side effects. Nothing else
471
// to do here.
472
self.stats.skeleton_inst_gvn += 1;
473
trace!(" -> merges with dominating instruction");
474
}
475
(_, _) => unreachable!(),
476
}
477
Some(SkeletonInstSimplification::Remove)
478
}
479
ScopedEntry::Vacant(v) => {
480
// Otherwise, insert it into the value-map.
481
if let Some(result) = result {
482
self.value_to_opt_value[result] = result;
483
self.available_block[result] = block;
484
}
485
v.insert(result);
486
trace!(" -> inserts as new (no GVN)");
487
None
488
}
489
}
490
}
491
// Otherwise, if a load or store, process it with the alias
492
// analysis to see if we can optimize it (rewrite in terms of
493
// an earlier load or stored value).
494
else if let Some(new_result) =
495
self.alias_analysis
496
.process_inst(self.func, self.alias_analysis_state, inst)
497
{
498
self.stats.alias_analysis_removed += 1;
499
let result = self.func.dfg.first_result(inst);
500
trace!(
501
" -> inst {} has result {} replaced with {}",
502
inst, result, new_result
503
);
504
self.value_to_opt_value[result] = new_result;
505
self.available_block[result] = self.available_block[new_result];
506
self.func.dfg.merge_facts(result, new_result);
507
Some(SkeletonInstSimplification::Remove)
508
}
509
// Otherwise, generic side-effecting op -- always keep it, and
510
// set its results to identity-map to original values.
511
else {
512
// Set all results to identity-map to themselves
513
// in the value-to-opt-value map.
514
for &result in self.func.dfg.inst_results(inst) {
515
self.value_to_opt_value[result] = result;
516
self.available_block[result] = block;
517
}
518
None
519
}
520
}
521
522
/// Find the best simplification of the given skeleton instruction, if any,
523
/// by consulting our `simplify_skeleton` ISLE rules.
524
fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> {
525
// We cannot currently simplify terminators, or simplify into
526
// terminators. Anything that could change the control-flow graph is off
527
// limits.
528
//
529
// Consider the following CLIF snippet:
530
//
531
// block0(v0: i64):
532
// v1 = iconst.i32 0
533
// trapz v1, user42
534
// v2 = load.i32 v0
535
// brif v1, block1, block2
536
// block1:
537
// return v2
538
// block2:
539
// v3 = iconst.i32 1
540
// v4 = iadd v2, v3
541
// return v4
542
//
543
// We would ideally like to perform simplifications like replacing the
544
// `trapz` with an unconditional `trap` and the conditional `brif`
545
// branch with an unconditional `jump`. Note, however, that blocks
546
// `block1` and `block2` are dominated by `block0` and therefore can and
547
// do use values defined in `block0`. This presents challenges:
548
//
549
// * If we replace the `brif` with a `jump`, then we've mutated the
550
// control-flow graph and removed that domination property. The uses
551
// of `v2` and `v3` in those blocks become invalid.
552
//
553
// * Even worse, if we turn the `trapz` into a `trap`, we are
554
// introducing a terminator into the middle of the block, which leaves
555
// us with two choices to fix up the IR so that there aren't any
556
// instructions following the terminator in the block:
557
//
558
// 1. We can split the unreachable instructions off into a new
559
// block. However, there is no control-flow edge from the current
560
// block to this new block and so, again, the new block isn't
561
// dominated by the current block, and therefore the can't use
562
// values defined in this block or any dominating it. The `load`
563
// instruction uses `v0` but is not dominated by `v0`'s
564
// definition.
565
//
566
// 2. Alternatively, we can simply delete the trailing instructions,
567
// since they are unreachable. But then not only are the old
568
// instructions' uses no longer dominated by their definitions, but
569
// the definitions do not exist at all anymore!
570
//
571
// Whatever approach we would take, we would invalidate value uses, and
572
// would need to track and fix them up.
573
if self.func.dfg.insts[inst].opcode().is_branch() {
574
return None;
575
}
576
577
let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts);
578
let (ctx, optimized_insts) = guard.get();
579
580
crate::opts::generated_code::constructor_simplify_skeleton(
581
&mut IsleContext { ctx },
582
inst,
583
optimized_insts,
584
);
585
586
let simplifications_len = optimized_insts.len();
587
log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)");
588
if simplifications_len > MATCHES_LIMIT {
589
log::trace!(" too many candidate simplifications; truncating to {MATCHES_LIMIT}");
590
optimized_insts.truncate(MATCHES_LIMIT);
591
}
592
593
// Find the best simplification, if any, from our candidates.
594
//
595
// Unlike simplifying pure values, we do not add side-effectful
596
// instructions to the egraph, nor do we extract the best version via
597
// dynamic programming and considering the costs of operands. Instead,
598
// we greedily choose the best simplification. This is because there is
599
// an impedance mismatch: the egraph and our pure rewrites are centered
600
// around *values*, but we don't represent side-effects with values, we
601
// represent them implicitly in their *instructions*.
602
//
603
// The initial best choice is "no simplification, just use the original
604
// instruction" which has the original instruction's cost.
605
let mut best = None;
606
let mut best_cost = cost::Cost::of_skeleton_op(
607
ctx.func.dfg.insts[inst].opcode(),
608
ctx.func.dfg.inst_args(inst).len(),
609
);
610
while let Some(simplification) = optimized_insts.pop() {
611
let (new_inst, new_val) = match simplification {
612
// We can't do better than completely removing the skeleton
613
// instruction, so short-cicuit the loop and eagerly return the
614
// `Remove*` simplifications.
615
SkeletonInstSimplification::Remove => {
616
log::trace!(" -> simplify_skeleton: remove inst");
617
debug_assert!(ctx.func.dfg.inst_results(inst).is_empty());
618
return Some(simplification);
619
}
620
SkeletonInstSimplification::RemoveWithVal { val } => {
621
log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result");
622
if cfg!(debug_assertions) {
623
let results = ctx.func.dfg.inst_results(inst);
624
debug_assert_eq!(results.len(), 1);
625
debug_assert_eq!(
626
ctx.func.dfg.value_type(results[0]),
627
ctx.func.dfg.value_type(val),
628
);
629
}
630
return Some(simplification);
631
}
632
633
// For instruction replacement simplification, we want to check
634
// that the replacements define the same number and types of
635
// values as the original instruction, and also determine
636
// whether they are actually an improvement over (i.e. have
637
// lower cost than) the original instruction.
638
SkeletonInstSimplification::Replace { inst } => {
639
log::trace!(
640
" -> simplify_skeleton: replace inst with {inst}: {}",
641
ctx.func.dfg.display_inst(inst)
642
);
643
(inst, None)
644
}
645
SkeletonInstSimplification::ReplaceWithVal { inst, val } => {
646
log::trace!(
647
" -> simplify_skeleton: replace inst with {val} and {inst}: {}",
648
ctx.func.dfg.display_inst(inst)
649
);
650
(inst, Some(val))
651
}
652
};
653
654
if cfg!(debug_assertions) {
655
let opcode = ctx.func.dfg.insts[inst].opcode();
656
debug_assert!(
657
!(opcode.is_terminator() || opcode.is_branch()),
658
"simplifying control-flow instructions and terminators is not yet supported",
659
);
660
661
let old_vals = ctx.func.dfg.inst_results(inst);
662
let new_vals = if let Some(val) = new_val.as_ref() {
663
std::slice::from_ref(val)
664
} else {
665
ctx.func.dfg.inst_results(new_inst)
666
};
667
debug_assert_eq!(
668
old_vals.len(),
669
new_vals.len(),
670
"skeleton simplification should result in the same number of result values",
671
);
672
673
for (old_val, new_val) in old_vals.iter().zip(new_vals) {
674
let old_ty = ctx.func.dfg.value_type(*old_val);
675
let new_ty = ctx.func.dfg.value_type(*new_val);
676
debug_assert_eq!(
677
old_ty, new_ty,
678
"skeleton simplification should result in values of the correct type",
679
);
680
}
681
}
682
683
// Our best simplification is the one with the least cost. Update
684
// `best` if necessary.
685
let cost = cost::Cost::of_skeleton_op(
686
ctx.func.dfg.insts[new_inst].opcode(),
687
ctx.func.dfg.inst_args(new_inst).len(),
688
);
689
if cost < best_cost {
690
best = Some(simplification);
691
best_cost = cost;
692
}
693
}
694
695
// Return the best simplification!
696
best
697
}
698
699
/// Helper to propagate facts on constant values: if PCC is
700
/// enabled, then unconditionally add a fact attesting to the
701
/// Value's concrete value.
702
fn attach_constant_fact(&mut self, inst: Inst, value: Value, ty: Type) {
703
if self.flags.enable_pcc() {
704
if let InstructionData::UnaryImm {
705
opcode: Opcode::Iconst,
706
imm,
707
} = self.func.dfg.insts[inst]
708
{
709
let imm: i64 = imm.into();
710
self.func.dfg.facts[value] =
711
Some(Fact::constant(ty.bits().try_into().unwrap(), imm as u64));
712
}
713
}
714
}
715
}
716
717
impl<'a> EgraphPass<'a> {
718
/// Create a new EgraphPass.
719
pub fn new(
720
func: &'a mut Function,
721
domtree: &'a DominatorTree,
722
loop_analysis: &'a LoopAnalysis,
723
alias_analysis: &'a mut AliasAnalysis<'a>,
724
flags: &'a Flags,
725
ctrl_plane: &'a mut ControlPlane,
726
) -> Self {
727
Self {
728
func,
729
domtree,
730
loop_analysis,
731
alias_analysis,
732
flags,
733
ctrl_plane,
734
stats: Stats::default(),
735
remat_values: FxHashSet::default(),
736
}
737
}
738
739
/// Run the process.
740
pub fn run(&mut self) {
741
self.remove_pure_and_optimize();
742
743
trace!("egraph built:\n{}\n", self.func.display());
744
if cfg!(feature = "trace-log") {
745
for (value, def) in self.func.dfg.values_and_defs() {
746
trace!(" -> {} = {:?}", value, def);
747
match def {
748
ValueDef::Result(i, 0) => {
749
trace!(" -> {} = {:?}", i, self.func.dfg.insts[i]);
750
}
751
_ => {}
752
}
753
}
754
}
755
756
self.elaborate();
757
758
log::trace!("stats: {:#?}", self.stats);
759
}
760
761
/// Remove pure nodes from the `Layout` of the function, ensuring
762
/// that only the "side-effect skeleton" remains, and also
763
/// optimize the pure nodes. This is the first step of
764
/// egraph-based processing and turns the pure CFG-based CLIF into
765
/// a CFG skeleton with a sea of (optimized) nodes tying it
766
/// together.
767
///
768
/// As we walk through the code, we eagerly apply optimization
769
/// rules; at any given point we have a "latest version" of an
770
/// eclass of possible representations for a `Value` in the
771
/// original program, which is itself a `Value` at the root of a
772
/// union-tree. We keep a map from the original values to these
773
/// optimized values. When we encounter any instruction (pure or
774
/// side-effecting skeleton) we rewrite its arguments to capture
775
/// the "latest" optimized forms of these values. (We need to do
776
/// this as part of this pass, and not later using a finished map,
777
/// because the eclass can continue to be updated and we need to
778
/// only refer to its subset that exists at this stage, to
779
/// maintain acyclicity.)
780
fn remove_pure_and_optimize(&mut self) {
781
let mut cursor = FuncCursor::new(self.func);
782
let mut value_to_opt_value: SecondaryMap<Value, Value> =
783
SecondaryMap::with_default(Value::reserved_value());
784
785
// Map from instruction to value for hash-consing of pure ops
786
// into the egraph. This can be a standard (non-scoped)
787
// hashmap because pure ops have no location: they are
788
// "outside of" control flow.
789
//
790
// Note also that we keep the controlling typevar (the `Type`
791
// in the tuple below) because it may disambiguate
792
// instructions that are identical except for type.
793
//
794
// We store both skeleton and non-skeleton instructions in the
795
// GVN map; for skeleton instructions, we only store those
796
// that are idempotent, i.e., still eligible to GVN. Note that
797
// some skeleton instructions are idempotent but do not
798
// produce a value: e.g., traps on a given condition. To allow
799
// for both cases, we store an `Option<Value>` as the value in
800
// this map.
801
let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> =
802
ScopedHashMap::with_capacity(cursor.func.dfg.num_values());
803
804
// The block in the domtree preorder traversal at each level
805
// of the GVN map.
806
let mut gvn_map_blocks: Vec<Block> = vec![];
807
808
// To get the best possible merging and canonicalization, we
809
// track where a value is "available" at: this is the
810
// domtree-nearest-ancestor join of all args if the value
811
// itself is pure, otherwise the block where the value is
812
// defined. (And for union nodes, the
813
// domtree-highest-ancestor, i.e., the meet or the dual to the
814
// above join.)
815
let mut available_block: SecondaryMap<Value, Block> =
816
SecondaryMap::with_default(Block::reserved_value());
817
818
// To avoid blowing up eclasses too much, we track the size of
819
// each eclass reachable by a tree of union nodes from a given
820
// value ID, and we avoid union'ing additional values into an
821
// eclass when it reaches `ECLASS_ENODE_LIMIT`.
822
//
823
// For efficiency, this encodes size minus one: so a value of
824
// zero (which is cheap to bulk-initialize) means a singleton
825
// eclass of size one. This also allows us to avoid explicitly
826
// writing the size for any values that are not union nodes.
827
let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0);
828
829
// This is an initial guess at the size we'll need, but we add
830
// more values as we build simplified alternative expressions so
831
// this is likely to realloc again later.
832
available_block.resize(cursor.func.dfg.num_values());
833
834
// In domtree preorder, visit blocks. (TODO: factor out an
835
// iterator from this and elaborator.)
836
let root = cursor.layout().entry_block().unwrap();
837
enum StackEntry {
838
Visit(Block),
839
Pop,
840
}
841
let mut block_stack = vec![StackEntry::Visit(root)];
842
while let Some(entry) = block_stack.pop() {
843
match entry {
844
StackEntry::Visit(block) => {
845
// We popped this block; push children
846
// immediately, then process this block.
847
block_stack.push(StackEntry::Pop);
848
block_stack.extend(
849
self.ctrl_plane
850
.shuffled(self.domtree.children(block))
851
.map(StackEntry::Visit),
852
);
853
gvn_map.increment_depth();
854
gvn_map_blocks.push(block);
855
856
trace!("Processing block {}", block);
857
cursor.set_position(CursorPosition::Before(block));
858
859
let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
860
861
for &param in cursor.func.dfg.block_params(block) {
862
trace!("creating initial singleton eclass for blockparam {}", param);
863
value_to_opt_value[param] = param;
864
available_block[param] = block;
865
}
866
while let Some(inst) = cursor.next_inst() {
867
trace!(
868
"Processing inst {inst}: {}",
869
cursor.func.dfg.display_inst(inst),
870
);
871
872
// Rewrite args of *all* instructions using the
873
// value-to-opt-value map.
874
cursor.func.dfg.map_inst_values(inst, |arg| {
875
let new_value = value_to_opt_value[arg];
876
trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
877
debug_assert_ne!(new_value, Value::reserved_value());
878
new_value
879
});
880
881
// Build a context for optimization, with borrows of
882
// state. We can't invoke a method on `self` because
883
// we've borrowed `self.func` mutably (as
884
// `cursor.func`) so we pull apart the pieces instead
885
// here.
886
let mut ctx = OptimizeCtx {
887
func: cursor.func,
888
value_to_opt_value: &mut value_to_opt_value,
889
gvn_map: &mut gvn_map,
890
gvn_map_blocks: &mut gvn_map_blocks,
891
available_block: &mut available_block,
892
eclass_size: &mut eclass_size,
893
rewrite_depth: 0,
894
subsume_values: FxHashSet::default(),
895
remat_values: &mut self.remat_values,
896
stats: &mut self.stats,
897
domtree: &self.domtree,
898
alias_analysis: self.alias_analysis,
899
alias_analysis_state: &mut alias_analysis_state,
900
flags: self.flags,
901
ctrl_plane: self.ctrl_plane,
902
optimized_values: Default::default(),
903
optimized_insts: Default::default(),
904
};
905
906
if is_pure_for_egraph(ctx.func, inst) {
907
// Insert into GVN map and optimize any new nodes
908
// inserted (recursively performing this work for
909
// any nodes the optimization rules produce).
910
let inst = NewOrExistingInst::Existing(inst);
911
ctx.insert_pure_enode(inst);
912
// We've now rewritten all uses, or will when we
913
// see them, and the instruction exists as a pure
914
// enode in the eclass, so we can remove it.
915
cursor.remove_inst_and_step_back();
916
} else {
917
if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) {
918
Self::execute_skeleton_inst_simplification(
919
cmd,
920
&mut cursor,
921
&mut value_to_opt_value,
922
inst,
923
);
924
}
925
}
926
}
927
}
928
StackEntry::Pop => {
929
gvn_map.decrement_depth();
930
gvn_map_blocks.pop();
931
}
932
}
933
}
934
}
935
936
/// Execute a simplification of an instruction in the side-effectful
937
/// skeleton.
938
fn execute_skeleton_inst_simplification(
939
simplification: SkeletonInstSimplification,
940
cursor: &mut FuncCursor,
941
value_to_opt_value: &mut SecondaryMap<Value, Value>,
942
old_inst: Inst,
943
) {
944
let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| {
945
cursor.func.dfg.change_to_alias(old_val, new_val);
946
value_to_opt_value[old_val] = new_val;
947
};
948
949
let (new_inst, new_val) = match simplification {
950
SkeletonInstSimplification::Remove => {
951
cursor.remove_inst_and_step_back();
952
return;
953
}
954
SkeletonInstSimplification::RemoveWithVal { val } => {
955
cursor.remove_inst_and_step_back();
956
let old_val = cursor.func.dfg.first_result(old_inst);
957
cursor.func.dfg.detach_inst_results(old_inst);
958
forward_val(cursor, old_val, val);
959
return;
960
}
961
SkeletonInstSimplification::Replace { inst } => (inst, None),
962
SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)),
963
};
964
965
// Replace the old instruction with the new one.
966
cursor.replace_inst(new_inst);
967
debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator());
968
969
// Redirect the old instruction's result values to our new instruction's
970
// result values.
971
let mut i = 0;
972
let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value {
973
if let Some(val) = new_val {
974
val
975
} else {
976
let val = dfg.inst_results(new_inst)[i];
977
i += 1;
978
val
979
}
980
};
981
for i in 0..cursor.func.dfg.inst_results(old_inst).len() {
982
let old_val = cursor.func.dfg.inst_results(old_inst)[i];
983
let new_val = next_new_val(&cursor.func.dfg);
984
forward_val(cursor, old_val, new_val);
985
}
986
987
// Back up so that the next iteration of the outer egraph loop will
988
// process the new instruction.
989
cursor.goto_inst(new_inst);
990
cursor.prev_inst();
991
}
992
993
/// Scoped elaboration: compute a final ordering of op computation
994
/// for each block and update the given Func body. After this
995
/// runs, the function body is back into the state where every
996
/// Inst with an used result is placed in the layout (possibly
997
/// duplicated, if our code-motion logic decides this is the best
998
/// option).
999
///
1000
/// This works in concert with the domtree. We do a preorder
1001
/// traversal of the domtree, tracking a scoped map from Id to
1002
/// (new) Value. The map's scopes correspond to levels in the
1003
/// domtree.
1004
///
1005
/// At each block, we iterate forward over the side-effecting
1006
/// eclasses, and recursively generate their arg eclasses, then
1007
/// emit the ops themselves.
1008
///
1009
/// To use an eclass in a given block, we first look it up in the
1010
/// scoped map, and get the Value if already present. If not, we
1011
/// need to generate it. We emit the extracted enode for this
1012
/// eclass after recursively generating its args. Eclasses are
1013
/// thus computed "as late as possible", but then memoized into
1014
/// the Id-to-Value map and available to all dominated blocks and
1015
/// for the rest of this block. (This subsumes GVN.)
1016
fn elaborate(&mut self) {
1017
let mut elaborator = Elaborator::new(
1018
self.func,
1019
&self.domtree,
1020
self.loop_analysis,
1021
&self.remat_values,
1022
&mut self.stats,
1023
self.ctrl_plane,
1024
);
1025
elaborator.elaborate();
1026
1027
self.check_post_egraph();
1028
}
1029
1030
#[cfg(debug_assertions)]
1031
fn check_post_egraph(&self) {
1032
// Verify that no union nodes are reachable from inst args,
1033
// and that all inst args' defining instructions are in the
1034
// layout.
1035
for block in self.func.layout.blocks() {
1036
for inst in self.func.layout.block_insts(block) {
1037
self.func
1038
.dfg
1039
.inst_values(inst)
1040
.for_each(|arg| match self.func.dfg.value_def(arg) {
1041
ValueDef::Result(i, _) => {
1042
debug_assert!(self.func.layout.inst_block(i).is_some());
1043
}
1044
ValueDef::Union(..) => {
1045
panic!("egraph union node {arg} still reachable at {inst}!");
1046
}
1047
_ => {}
1048
})
1049
}
1050
}
1051
}
1052
1053
#[cfg(not(debug_assertions))]
1054
fn check_post_egraph(&self) {}
1055
}
1056
1057
/// Implementation of external-context equality and hashing on
1058
/// InstructionData. This allows us to deduplicate instructions given
1059
/// some context that lets us see its value lists, so we don't need to
1060
/// store arguments inline in the `InstuctionData` (or alongside it in
1061
/// some newly-defined key type) in all cases.
1062
struct GVNContext<'a> {
1063
value_lists: &'a ValueListPool,
1064
}
1065
1066
impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
1067
fn ctx_eq(
1068
&self,
1069
(a_ty, a_inst): &(Type, InstructionData),
1070
(b_ty, b_inst): &(Type, InstructionData),
1071
) -> bool {
1072
a_ty == b_ty && a_inst.eq(b_inst, self.value_lists)
1073
}
1074
}
1075
1076
impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
1077
fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
1078
std::hash::Hash::hash(&ty, state);
1079
inst.hash(state, self.value_lists);
1080
}
1081
}
1082
1083
/// Statistics collected during egraph-based processing.
1084
#[derive(Clone, Debug, Default)]
1085
pub(crate) struct Stats {
1086
pub(crate) pure_inst: u64,
1087
pub(crate) pure_inst_deduped: u64,
1088
pub(crate) pure_inst_subsume: u64,
1089
pub(crate) pure_inst_rewrite_to_self: u64,
1090
pub(crate) pure_inst_insert_orig: u64,
1091
pub(crate) pure_inst_insert_new: u64,
1092
pub(crate) skeleton_inst: u64,
1093
pub(crate) skeleton_inst_simplified: u64,
1094
pub(crate) skeleton_inst_gvn: u64,
1095
pub(crate) alias_analysis_removed: u64,
1096
pub(crate) new_inst: u64,
1097
pub(crate) union: u64,
1098
pub(crate) subsume: u64,
1099
pub(crate) remat: u64,
1100
pub(crate) rewrite_rule_invoked: u64,
1101
pub(crate) rewrite_rule_results: u64,
1102
pub(crate) rewrite_depth_limit: u64,
1103
pub(crate) elaborate_visit_node: u64,
1104
pub(crate) elaborate_memoize_hit: u64,
1105
pub(crate) elaborate_memoize_miss: u64,
1106
pub(crate) elaborate_remat: u64,
1107
pub(crate) elaborate_licm_hoist: u64,
1108
pub(crate) elaborate_func: u64,
1109
pub(crate) elaborate_func_pre_insts: u64,
1110
pub(crate) elaborate_func_post_insts: u64,
1111
pub(crate) elaborate_best_cost_fixpoint_iters: u64,
1112
pub(crate) eclass_size_limit: u64,
1113
}
1114
1115