GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
1
//===- DeadStoreElimination.cpp - MemorySSA Backed Dead Store Elimination -===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// The code below implements dead store elimination using MemorySSA. It uses
10
// the following general approach: given a MemoryDef, walk upwards to find
11
// clobbering MemoryDefs that may be killed by the starting def. Then check
12
// that there are no uses that may read the location of the original MemoryDef
13
// in between both MemoryDefs. A bit more concretely:
14
//
15
// For all MemoryDefs StartDef:
16
// 1. Get the next dominating clobbering MemoryDef (MaybeDeadAccess) by walking
17
// upwards.
18
// 2. Check that there are no reads between MaybeDeadAccess and the StartDef by
19
// checking all uses starting at MaybeDeadAccess and walking until we see
20
// StartDef.
21
// 3. For each found CurrentDef, check that:
22
// 1. There are no barrier instructions between CurrentDef and StartDef (like
23
// throws or stores with ordering constraints).
24
// 2. StartDef is executed whenever CurrentDef is executed.
25
// 3. StartDef completely overwrites CurrentDef.
26
// 4. Erase CurrentDef from the function and MemorySSA.
27
//
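// As an illustrative sketch (values chosen for this comment, not taken from
// an actual test), the first store below is dead: the second store completely
// overwrites the same location before any read, so the steps above erase it.
//
//   store i32 0, ptr %p        ; MaybeDeadAccess, removed
//   store i32 1, ptr %p        ; StartDef, the killing store
//   %v = load i32, ptr %p      ; only observes the killing store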
28
//===----------------------------------------------------------------------===//
29
30
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
31
#include "llvm/ADT/APInt.h"
32
#include "llvm/ADT/DenseMap.h"
33
#include "llvm/ADT/MapVector.h"
34
#include "llvm/ADT/PostOrderIterator.h"
35
#include "llvm/ADT/SetVector.h"
36
#include "llvm/ADT/SmallPtrSet.h"
37
#include "llvm/ADT/SmallVector.h"
38
#include "llvm/ADT/Statistic.h"
39
#include "llvm/ADT/StringRef.h"
40
#include "llvm/Analysis/AliasAnalysis.h"
41
#include "llvm/Analysis/CaptureTracking.h"
42
#include "llvm/Analysis/GlobalsModRef.h"
43
#include "llvm/Analysis/LoopInfo.h"
44
#include "llvm/Analysis/MemoryBuiltins.h"
45
#include "llvm/Analysis/MemoryLocation.h"
46
#include "llvm/Analysis/MemorySSA.h"
47
#include "llvm/Analysis/MemorySSAUpdater.h"
48
#include "llvm/Analysis/MustExecute.h"
49
#include "llvm/Analysis/PostDominators.h"
50
#include "llvm/Analysis/TargetLibraryInfo.h"
51
#include "llvm/Analysis/ValueTracking.h"
52
#include "llvm/IR/Argument.h"
53
#include "llvm/IR/BasicBlock.h"
54
#include "llvm/IR/Constant.h"
55
#include "llvm/IR/Constants.h"
56
#include "llvm/IR/DataLayout.h"
57
#include "llvm/IR/DebugInfo.h"
58
#include "llvm/IR/Dominators.h"
59
#include "llvm/IR/Function.h"
60
#include "llvm/IR/IRBuilder.h"
61
#include "llvm/IR/InstIterator.h"
62
#include "llvm/IR/InstrTypes.h"
63
#include "llvm/IR/Instruction.h"
64
#include "llvm/IR/Instructions.h"
65
#include "llvm/IR/IntrinsicInst.h"
66
#include "llvm/IR/Module.h"
67
#include "llvm/IR/PassManager.h"
68
#include "llvm/IR/PatternMatch.h"
69
#include "llvm/IR/Value.h"
70
#include "llvm/Support/Casting.h"
71
#include "llvm/Support/CommandLine.h"
72
#include "llvm/Support/Debug.h"
73
#include "llvm/Support/DebugCounter.h"
74
#include "llvm/Support/ErrorHandling.h"
75
#include "llvm/Support/raw_ostream.h"
76
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
77
#include "llvm/Transforms/Utils/BuildLibCalls.h"
78
#include "llvm/Transforms/Utils/Local.h"
79
#include <algorithm>
80
#include <cassert>
81
#include <cstdint>
82
#include <iterator>
83
#include <map>
84
#include <optional>
85
#include <utility>
86
87
using namespace llvm;
88
using namespace PatternMatch;
89
90
#define DEBUG_TYPE "dse"
91
92
STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
93
STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
94
STATISTIC(NumFastStores, "Number of stores deleted");
95
STATISTIC(NumFastOther, "Number of other instrs removed");
96
STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
97
STATISTIC(NumModifiedStores, "Number of stores modified");
98
STATISTIC(NumCFGChecks, "Number of CFG checks performed");
99
STATISTIC(NumCFGTries, "Number of CFG-based elimination attempts");
100
STATISTIC(NumCFGSuccess, "Number of successful CFG-based eliminations");
101
STATISTIC(NumGetDomMemoryDefPassed,
102
"Number of times a valid candidate is returned from getDomMemoryDef");
103
STATISTIC(NumDomMemDefChecks,
104
"Number iterations check for reads in getDomMemoryDef");
105
106
DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
107
"Controls which MemoryDefs are eliminated.");
108
109
static cl::opt<bool>
110
EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
111
cl::init(true), cl::Hidden,
112
cl::desc("Enable partial-overwrite tracking in DSE"));
113
114
static cl::opt<bool>
115
EnablePartialStoreMerging("enable-dse-partial-store-merging",
116
cl::init(true), cl::Hidden,
117
cl::desc("Enable partial store merging in DSE"));
118
119
static cl::opt<unsigned>
120
MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
121
cl::desc("The number of memory instructions to scan for "
122
"dead store elimination (default = 150)"));
123
static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
124
"dse-memoryssa-walklimit", cl::init(90), cl::Hidden,
125
cl::desc("The maximum number of steps while walking upwards to find "
126
"MemoryDefs that may be killed (default = 90)"));
127
128
static cl::opt<unsigned> MemorySSAPartialStoreLimit(
129
"dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
130
cl::desc("The maximum number candidates that only partially overwrite the "
131
"killing MemoryDef to consider"
132
" (default = 5)"));
133
134
static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
135
"dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
136
cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
137
"other stores per basic block (default = 5000)"));
138
139
static cl::opt<unsigned> MemorySSASameBBStepCost(
140
"dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden,
141
cl::desc(
142
"The cost of a step in the same basic block as the killing MemoryDef"
143
"(default = 1)"));
144
145
static cl::opt<unsigned>
146
MemorySSAOtherBBStepCost("dse-memoryssa-otherbb-cost", cl::init(5),
147
cl::Hidden,
148
cl::desc("The cost of a step in a different basic "
149
"block than the killing MemoryDef"
150
"(default = 5)"));
151
152
static cl::opt<unsigned> MemorySSAPathCheckLimit(
153
"dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
154
cl::desc("The maximum number of blocks to check when trying to prove that "
155
"all paths to an exit go through a killing block (default = 50)"));
156
157
// This flag allows or disallows DSE to optimize MemorySSA during its
158
// traversal. Note that DSE optimizing MemorySSA may impact other passes
159
// downstream of the DSE invocation and can lead to issues not being
160
// reproducible in isolation (i.e. when MemorySSA is built from scratch). In
161
// those cases, the flag can be used to check if DSE's MemorySSA optimizations
162
// impact follow-up passes.
163
static cl::opt<bool>
164
OptimizeMemorySSA("dse-optimize-memoryssa", cl::init(true), cl::Hidden,
165
cl::desc("Allow DSE to optimize memory accesses."));
166
167
//===----------------------------------------------------------------------===//
168
// Helper functions
169
//===----------------------------------------------------------------------===//
170
using OverlapIntervalsTy = std::map<int64_t, int64_t>;
171
using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
172
173
/// Returns true if the end of this instruction can be safely shortened in
174
/// length.
175
static bool isShortenableAtTheEnd(Instruction *I) {
176
// Don't shorten stores for now
177
if (isa<StoreInst>(I))
178
return false;
179
180
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
181
switch (II->getIntrinsicID()) {
182
default: return false;
183
case Intrinsic::memset:
184
case Intrinsic::memcpy:
185
case Intrinsic::memcpy_element_unordered_atomic:
186
case Intrinsic::memset_element_unordered_atomic:
187
// Do shorten memory intrinsics.
188
// FIXME: Add memmove if it's also safe to transform.
189
return true;
190
}
191
}
192
193
// Don't shorten libcalls for now.
194
195
return false;
196
}
197
198
/// Returns true if the beginning of this instruction can be safely shortened
199
/// in length.
200
static bool isShortenableAtTheBeginning(Instruction *I) {
201
// FIXME: Handle only memset for now. Supporting memcpy/memmove should be
202
// easily done by offsetting the source address.
203
return isa<AnyMemSetInst>(I);
204
}
205
206
static std::optional<TypeSize> getPointerSize(const Value *V,
207
const DataLayout &DL,
208
const TargetLibraryInfo &TLI,
209
const Function *F) {
210
uint64_t Size;
211
ObjectSizeOpts Opts;
212
Opts.NullIsUnknownSize = NullPointerIsDefined(F);
213
214
if (getObjectSize(V, Size, DL, &TLI, Opts))
215
return TypeSize::getFixed(Size);
216
return std::nullopt;
217
}
218
219
namespace {
220
221
enum OverwriteResult {
222
OW_Begin,
223
OW_Complete,
224
OW_End,
225
OW_PartialEarlierWithFullLater,
226
OW_MaybePartial,
227
OW_None,
228
OW_Unknown
229
};
230
231
} // end anonymous namespace
232
233
/// Check if two instructions are masked stores that completely
234
/// overwrite one another. More specifically, \p KillingI has to
235
/// overwrite \p DeadI.
236
static OverwriteResult isMaskedStoreOverwrite(const Instruction *KillingI,
237
const Instruction *DeadI,
238
BatchAAResults &AA) {
239
const auto *KillingII = dyn_cast<IntrinsicInst>(KillingI);
240
const auto *DeadII = dyn_cast<IntrinsicInst>(DeadI);
241
if (KillingII == nullptr || DeadII == nullptr)
242
return OW_Unknown;
243
if (KillingII->getIntrinsicID() != DeadII->getIntrinsicID())
244
return OW_Unknown;
245
if (KillingII->getIntrinsicID() == Intrinsic::masked_store) {
246
// Type size.
247
VectorType *KillingTy =
248
cast<VectorType>(KillingII->getArgOperand(0)->getType());
249
VectorType *DeadTy = cast<VectorType>(DeadII->getArgOperand(0)->getType());
250
if (KillingTy->getScalarSizeInBits() != DeadTy->getScalarSizeInBits())
251
return OW_Unknown;
252
// Element count.
253
if (KillingTy->getElementCount() != DeadTy->getElementCount())
254
return OW_Unknown;
255
// Pointers.
256
Value *KillingPtr = KillingII->getArgOperand(1)->stripPointerCasts();
257
Value *DeadPtr = DeadII->getArgOperand(1)->stripPointerCasts();
258
if (KillingPtr != DeadPtr && !AA.isMustAlias(KillingPtr, DeadPtr))
259
return OW_Unknown;
260
// Masks.
261
// TODO: check that KillingII's mask is a superset of the DeadII's mask.
262
if (KillingII->getArgOperand(3) != DeadII->getArgOperand(3))
263
return OW_Unknown;
264
return OW_Complete;
265
}
266
return OW_Unknown;
267
}
268
269
/// Return 'OW_Complete' if a store to the 'KillingLoc' location completely
270
/// overwrites a store to the 'DeadLoc' location, 'OW_End' if the end of the
271
/// 'DeadLoc' location is completely overwritten by 'KillingLoc', 'OW_Begin'
272
/// if the beginning of the 'DeadLoc' location is overwritten by 'KillingLoc'.
273
/// 'OW_PartialEarlierWithFullLater' means that a dead (big) store was
274
/// overwritten by a killing (smaller) store which doesn't write outside the big
275
/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
276
/// NOTE: This function must only be called if both \p KillingLoc and \p
277
/// DeadLoc belong to the same underlying object with valid \p KillingOff and
278
/// \p DeadOff.
279
static OverwriteResult isPartialOverwrite(const MemoryLocation &KillingLoc,
280
const MemoryLocation &DeadLoc,
281
int64_t KillingOff, int64_t DeadOff,
282
Instruction *DeadI,
283
InstOverlapIntervalsTy &IOL) {
284
const uint64_t KillingSize = KillingLoc.Size.getValue();
285
const uint64_t DeadSize = DeadLoc.Size.getValue();
286
// We may now overlap, although the overlap is not complete. There might also
287
// be other incomplete overlaps, and together, they might cover the complete
288
// dead store.
289
// Note: The correctness of this logic depends on the fact that this function
290
// is not even called with DeadI when there are any intervening reads.
291
if (EnablePartialOverwriteTracking &&
292
KillingOff < int64_t(DeadOff + DeadSize) &&
293
int64_t(KillingOff + KillingSize) >= DeadOff) {
294
295
// Insert our part of the overlap into the map.
296
auto &IM = IOL[DeadI];
297
LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: DeadLoc [" << DeadOff << ", "
298
<< int64_t(DeadOff + DeadSize) << ") KillingLoc ["
299
<< KillingOff << ", " << int64_t(KillingOff + KillingSize)
300
<< ")\n");
301
302
// Make sure that we only insert non-overlapping intervals and combine
303
// adjacent intervals. The intervals are stored in the map with the ending
304
// offset as the key (in the half-open sense) and the starting offset as
305
// the value.
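// For example (hypothetical offsets): a killing write covering [4, 12) is
// recorded as IM[12] = 4; a later killing write covering [0, 6) overlaps it
// and the two entries are merged below into IM[12] = 0.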
306
int64_t KillingIntStart = KillingOff;
307
int64_t KillingIntEnd = KillingOff + KillingSize;
308
309
// Find any intervals ending at, or after, KillingIntStart which start
310
// before KillingIntEnd.
311
auto ILI = IM.lower_bound(KillingIntStart);
312
if (ILI != IM.end() && ILI->second <= KillingIntEnd) {
313
// This existing interval is overlapped with the current store somewhere
314
// in [KillingIntStart, KillingIntEnd]. Merge them by erasing the existing
315
// intervals and adjusting our start and end.
316
KillingIntStart = std::min(KillingIntStart, ILI->second);
317
KillingIntEnd = std::max(KillingIntEnd, ILI->first);
318
ILI = IM.erase(ILI);
319
320
// Continue erasing and adjusting our end in case other previous
321
// intervals are also overlapped with the current store.
322
//
323
// |--- dead 1 ---| |--- dead 2 ---|
324
// |------- killing---------|
325
//
326
while (ILI != IM.end() && ILI->second <= KillingIntEnd) {
327
assert(ILI->second > KillingIntStart && "Unexpected interval");
328
KillingIntEnd = std::max(KillingIntEnd, ILI->first);
329
ILI = IM.erase(ILI);
330
}
331
}
332
333
IM[KillingIntEnd] = KillingIntStart;
334
335
ILI = IM.begin();
336
if (ILI->second <= DeadOff && ILI->first >= int64_t(DeadOff + DeadSize)) {
337
LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: DeadLoc ["
338
<< DeadOff << ", " << int64_t(DeadOff + DeadSize)
339
<< ") Composite KillingLoc [" << ILI->second << ", "
340
<< ILI->first << ")\n");
341
++NumCompletePartials;
342
return OW_Complete;
343
}
344
}
345
346
// Check for a dead store which writes to all the memory locations that
347
// the killing store writes to.
348
if (EnablePartialStoreMerging && KillingOff >= DeadOff &&
349
int64_t(DeadOff + DeadSize) > KillingOff &&
350
uint64_t(KillingOff - DeadOff) + KillingSize <= DeadSize) {
351
LLVM_DEBUG(dbgs() << "DSE: Partial overwrite a dead load [" << DeadOff
352
<< ", " << int64_t(DeadOff + DeadSize)
353
<< ") by a killing store [" << KillingOff << ", "
354
<< int64_t(KillingOff + KillingSize) << ")\n");
355
// TODO: Maybe come up with a better name?
356
return OW_PartialEarlierWithFullLater;
357
}
358
359
// Another interesting case is if the killing store overwrites the end of the
360
// dead store.
361
//
362
// |--dead--|
363
// |-- killing --|
364
//
365
// In this case we may want to trim the size of dead store to avoid
366
// generating stores to addresses which will definitely be overwritten by the killing
367
// store.
368
if (!EnablePartialOverwriteTracking &&
369
(KillingOff > DeadOff && KillingOff < int64_t(DeadOff + DeadSize) &&
370
int64_t(KillingOff + KillingSize) >= int64_t(DeadOff + DeadSize)))
371
return OW_End;
372
373
// Finally, we also need to check if the killing store overwrites the
374
// beginning of the dead store.
375
//
376
// |--dead--|
377
// |-- killing --|
378
//
379
// In this case we may want to move the destination address and trim the size
380
// of dead store to avoid generating stores to addresses which will definitely
381
// be overwritten by the killing store.
382
if (!EnablePartialOverwriteTracking &&
383
(KillingOff <= DeadOff && int64_t(KillingOff + KillingSize) > DeadOff)) {
384
assert(int64_t(KillingOff + KillingSize) < int64_t(DeadOff + DeadSize) &&
385
"Expect to be handled as OW_Complete");
386
return OW_Begin;
387
}
388
// Otherwise, they don't completely overlap.
389
return OW_Unknown;
390
}
391
392
/// Returns true if the memory which is accessed by the second instruction is not
393
/// modified between the first and the second instruction.
394
/// Precondition: Second instruction must be dominated by the first
395
/// instruction.
396
static bool
397
memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI,
398
BatchAAResults &AA, const DataLayout &DL,
399
DominatorTree *DT) {
400
// Do a backwards scan through the CFG from SecondI to FirstI. Look for
401
// instructions which can modify the memory location accessed by SecondI.
402
//
403
// While doing the walk keep track of the address to check. It might be
404
// different in different basic blocks due to PHI translation.
405
using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
406
SmallVector<BlockAddressPair, 16> WorkList;
407
// Keep track of the address we visited each block with. Bail out if we
408
// visit a block with different addresses.
409
DenseMap<BasicBlock *, Value *> Visited;
410
411
BasicBlock::iterator FirstBBI(FirstI);
412
++FirstBBI;
413
BasicBlock::iterator SecondBBI(SecondI);
414
BasicBlock *FirstBB = FirstI->getParent();
415
BasicBlock *SecondBB = SecondI->getParent();
416
MemoryLocation MemLoc;
417
if (auto *MemSet = dyn_cast<MemSetInst>(SecondI))
418
MemLoc = MemoryLocation::getForDest(MemSet);
419
else
420
MemLoc = MemoryLocation::get(SecondI);
421
422
auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
423
424
// Start checking the SecondBB.
425
WorkList.push_back(
426
std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
427
bool isFirstBlock = true;
428
429
// Check all blocks going backward until we reach the FirstBB.
430
while (!WorkList.empty()) {
431
BlockAddressPair Current = WorkList.pop_back_val();
432
BasicBlock *B = Current.first;
433
PHITransAddr &Addr = Current.second;
434
Value *Ptr = Addr.getAddr();
435
436
// Ignore instructions before FirstI if this is the FirstBB.
437
BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
438
439
BasicBlock::iterator EI;
440
if (isFirstBlock) {
441
// Ignore instructions after SecondI if this is the first visit of SecondBB.
442
assert(B == SecondBB && "first block is not the store block");
443
EI = SecondBBI;
444
isFirstBlock = false;
445
} else {
446
// It's not SecondBB or (in case of a loop) the second visit of SecondBB.
447
// In this case we also have to look at instructions after SecondI.
448
EI = B->end();
449
}
450
for (; BI != EI; ++BI) {
451
Instruction *I = &*BI;
452
if (I->mayWriteToMemory() && I != SecondI)
453
if (isModSet(AA.getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
454
return false;
455
}
456
if (B != FirstBB) {
457
assert(B != &FirstBB->getParent()->getEntryBlock() &&
458
"Should not hit the entry block because SI must be dominated by LI");
459
for (BasicBlock *Pred : predecessors(B)) {
460
PHITransAddr PredAddr = Addr;
461
if (PredAddr.needsPHITranslationFromBlock(B)) {
462
if (!PredAddr.isPotentiallyPHITranslatable())
463
return false;
464
if (!PredAddr.translateValue(B, Pred, DT, false))
465
return false;
466
}
467
Value *TranslatedPtr = PredAddr.getAddr();
468
auto Inserted = Visited.insert(std::make_pair(Pred, TranslatedPtr));
469
if (!Inserted.second) {
470
// We already visited this block before. If it was with a different
471
// address - bail out!
472
if (TranslatedPtr != Inserted.first->second)
473
return false;
474
// ... otherwise just skip it.
475
continue;
476
}
477
WorkList.push_back(std::make_pair(Pred, PredAddr));
478
}
479
}
480
}
481
return true;
482
}
483
484
static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
485
uint64_t OldOffsetInBits, uint64_t OldSizeInBits,
486
uint64_t NewSizeInBits, bool IsOverwriteEnd) {
487
const DataLayout &DL = Inst->getDataLayout();
488
uint64_t DeadSliceSizeInBits = OldSizeInBits - NewSizeInBits;
489
uint64_t DeadSliceOffsetInBits =
490
OldOffsetInBits + (IsOverwriteEnd ? NewSizeInBits : 0);
491
auto SetDeadFragExpr = [](auto *Assign,
492
DIExpression::FragmentInfo DeadFragment) {
493
// createFragmentExpression expects an offset relative to the existing
494
// fragment offset if there is one.
495
uint64_t RelativeOffset = DeadFragment.OffsetInBits -
496
Assign->getExpression()
497
->getFragmentInfo()
498
.value_or(DIExpression::FragmentInfo(0, 0))
499
.OffsetInBits;
500
if (auto NewExpr = DIExpression::createFragmentExpression(
501
Assign->getExpression(), RelativeOffset, DeadFragment.SizeInBits)) {
502
Assign->setExpression(*NewExpr);
503
return;
504
}
505
// Failed to create a fragment expression for this so discard the value,
506
// making this a kill location.
507
auto *Expr = *DIExpression::createFragmentExpression(
508
DIExpression::get(Assign->getContext(), std::nullopt),
509
DeadFragment.OffsetInBits, DeadFragment.SizeInBits);
510
Assign->setExpression(Expr);
511
Assign->setKillLocation();
512
};
513
514
// A DIAssignID to use so that the inserted dbg.assign intrinsics do not
515
// link to any instructions. Created in the loop below (once).
516
DIAssignID *LinkToNothing = nullptr;
517
LLVMContext &Ctx = Inst->getContext();
518
auto GetDeadLink = [&Ctx, &LinkToNothing]() {
519
if (!LinkToNothing)
520
LinkToNothing = DIAssignID::getDistinct(Ctx);
521
return LinkToNothing;
522
};
523
524
// Insert an unlinked dbg.assign intrinsic for the dead fragment after each
525
// overlapping dbg.assign intrinsic. The loop invalidates the iterators
526
// returned by getAssignmentMarkers so save a copy of the markers to iterate
527
// over.
528
auto LinkedRange = at::getAssignmentMarkers(Inst);
529
SmallVector<DbgVariableRecord *> LinkedDVRAssigns =
530
at::getDVRAssignmentMarkers(Inst);
531
SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(),
532
LinkedRange.end());
533
auto InsertAssignForOverlap = [&](auto *Assign) {
534
std::optional<DIExpression::FragmentInfo> NewFragment;
535
if (!at::calculateFragmentIntersect(DL, OriginalDest, DeadSliceOffsetInBits,
536
DeadSliceSizeInBits, Assign,
537
NewFragment) ||
538
!NewFragment) {
539
// We couldn't calculate the intersecting fragment for some reason. Be
540
// cautious and unlink the whole assignment from the store.
541
Assign->setKillAddress();
542
Assign->setAssignId(GetDeadLink());
543
return;
544
}
545
// No intersect.
546
if (NewFragment->SizeInBits == 0)
547
return;
548
549
// Fragments overlap: insert a new dbg.assign for this dead part.
550
auto *NewAssign = static_cast<decltype(Assign)>(Assign->clone());
551
NewAssign->insertAfter(Assign);
552
NewAssign->setAssignId(GetDeadLink());
553
if (NewFragment)
554
SetDeadFragExpr(NewAssign, *NewFragment);
555
NewAssign->setKillAddress();
556
};
557
for_each(Linked, InsertAssignForOverlap);
558
for_each(LinkedDVRAssigns, InsertAssignForOverlap);
559
}
560
561
static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
562
uint64_t &DeadSize, int64_t KillingStart,
563
uint64_t KillingSize, bool IsOverwriteEnd) {
564
auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
565
Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
566
567
// We assume that memset/memcpy operates in chunks of the "largest" native
568
// type size and aligned on the same value. That means optimal start and size
569
// of memset/memcpy should be modulo of preferred alignment of that type. That
570
// is, there is no sense in trying to reduce the store size any further
571
// since any "extra" stores comes for free anyway.
572
// On the other hand, the maximum alignment we can achieve is limited by the
573
// alignment of the initial store.
574
575
// TODO: Limit maximum alignment by preferred (or abi?) alignment of the
576
// "largest" native type.
577
// Note: What is the proper way to get that value?
578
// Should TargetTransformInfo::getRegisterBitWidth be used or anything else?
579
// PrefAlign = std::min(DL.getPrefTypeAlign(LargestType), PrefAlign);
580
581
int64_t ToRemoveStart = 0;
582
uint64_t ToRemoveSize = 0;
583
// Compute start and size of the region to remove. Make sure 'PrefAlign' is
584
// maintained on the remaining store.
585
if (IsOverwriteEnd) {
586
// Calculate required adjustment for 'KillingStart' in order to keep
587
// remaining store size aligned on 'PrefAlign'.
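// Worked example (made-up values): for a 32-byte dead memset starting at
// DeadStart = 0 with PrefAlign = 8, overwritten from KillingStart = 20
// onwards, offsetToAlignment(20, 8) == 4, so ToRemoveStart = 24 and
// ToRemoveSize = 8; the memset is shortened to 24 bytes and bytes [20, 24)
// are simply written by both stores.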
588
uint64_t Off =
589
offsetToAlignment(uint64_t(KillingStart - DeadStart), PrefAlign);
590
ToRemoveStart = KillingStart + Off;
591
if (DeadSize <= uint64_t(ToRemoveStart - DeadStart))
592
return false;
593
ToRemoveSize = DeadSize - uint64_t(ToRemoveStart - DeadStart);
594
} else {
595
ToRemoveStart = DeadStart;
596
assert(KillingSize >= uint64_t(DeadStart - KillingStart) &&
597
"Not overlapping accesses?");
598
ToRemoveSize = KillingSize - uint64_t(DeadStart - KillingStart);
599
// Calculate required adjustment for 'ToRemoveSize' in order to keep
600
// start of the remaining store aligned on 'PrefAlign'.
601
uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign);
602
if (Off != 0) {
603
if (ToRemoveSize <= (PrefAlign.value() - Off))
604
return false;
605
ToRemoveSize -= PrefAlign.value() - Off;
606
}
607
assert(isAligned(PrefAlign, ToRemoveSize) &&
608
"Should preserve selected alignment");
609
}
610
611
assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove");
612
assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
613
614
uint64_t NewSize = DeadSize - ToRemoveSize;
615
if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
616
// When shortening an atomic memory intrinsic, the newly shortened
617
// length must remain an integer multiple of the element size.
618
const uint32_t ElementSize = AMI->getElementSizeInBytes();
619
if (0 != NewSize % ElementSize)
620
return false;
621
}
622
623
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
624
<< (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *DeadI
625
<< "\n KILLER [" << ToRemoveStart << ", "
626
<< int64_t(ToRemoveStart + ToRemoveSize) << ")\n");
627
628
Value *DeadWriteLength = DeadIntrinsic->getLength();
629
Value *TrimmedLength = ConstantInt::get(DeadWriteLength->getType(), NewSize);
630
DeadIntrinsic->setLength(TrimmedLength);
631
DeadIntrinsic->setDestAlignment(PrefAlign);
632
633
Value *OrigDest = DeadIntrinsic->getRawDest();
634
if (!IsOverwriteEnd) {
635
Value *Indices[1] = {
636
ConstantInt::get(DeadWriteLength->getType(), ToRemoveSize)};
637
Instruction *NewDestGEP = GetElementPtrInst::CreateInBounds(
638
Type::getInt8Ty(DeadIntrinsic->getContext()), OrigDest, Indices, "",
639
DeadI->getIterator());
640
NewDestGEP->setDebugLoc(DeadIntrinsic->getDebugLoc());
641
DeadIntrinsic->setDest(NewDestGEP);
642
}
643
644
// Update attached dbg.assign intrinsics. Assume 8-bit byte.
645
shortenAssignment(DeadI, OrigDest, DeadStart * 8, DeadSize * 8, NewSize * 8,
646
IsOverwriteEnd);
647
648
// Finally update start and size of dead access.
649
if (!IsOverwriteEnd)
650
DeadStart += ToRemoveSize;
651
DeadSize = NewSize;
652
653
return true;
654
}
655
656
static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
657
int64_t &DeadStart, uint64_t &DeadSize) {
658
if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
659
return false;
660
661
OverlapIntervalsTy::iterator OII = --IntervalMap.end();
662
int64_t KillingStart = OII->second;
663
uint64_t KillingSize = OII->first - KillingStart;
664
665
assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
666
667
if (KillingStart > DeadStart &&
668
// Note: "KillingStart - KillingStart" is known to be positive due to
669
// preceding check.
670
(uint64_t)(KillingStart - DeadStart) < DeadSize &&
671
// Note: "DeadSize - (uint64_t)(KillingStart - DeadStart)" is known to
672
// be non negative due to preceding checks.
673
KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
674
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
675
true)) {
676
IntervalMap.erase(OII);
677
return true;
678
}
679
}
680
return false;
681
}
682
683
static bool tryToShortenBegin(Instruction *DeadI,
684
OverlapIntervalsTy &IntervalMap,
685
int64_t &DeadStart, uint64_t &DeadSize) {
686
if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
687
return false;
688
689
OverlapIntervalsTy::iterator OII = IntervalMap.begin();
690
int64_t KillingStart = OII->second;
691
uint64_t KillingSize = OII->first - KillingStart;
692
693
assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
694
695
if (KillingStart <= DeadStart &&
696
// Note: "DeadStart - KillingStart" is known to be non negative due to
697
// preceding check.
698
KillingSize > (uint64_t)(DeadStart - KillingStart)) {
699
// Note: "KillingSize - (uint64_t)(DeadStart - DeadStart)" is known to
700
// be positive due to preceding checks.
701
assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
702
"Should have been handled as OW_Complete");
703
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
704
false)) {
705
IntervalMap.erase(OII);
706
return true;
707
}
708
}
709
return false;
710
}
711
712
static Constant *
713
tryToMergePartialOverlappingStores(StoreInst *KillingI, StoreInst *DeadI,
714
int64_t KillingOffset, int64_t DeadOffset,
715
const DataLayout &DL, BatchAAResults &AA,
716
DominatorTree *DT) {
717
718
if (DeadI && isa<ConstantInt>(DeadI->getValueOperand()) &&
719
DL.typeSizeEqualsStoreSize(DeadI->getValueOperand()->getType()) &&
720
KillingI && isa<ConstantInt>(KillingI->getValueOperand()) &&
721
DL.typeSizeEqualsStoreSize(KillingI->getValueOperand()->getType()) &&
722
memoryIsNotModifiedBetween(DeadI, KillingI, AA, DL, DT)) {
723
// If the store we find is:
724
// a) partially overwritten by the store to 'Loc'
725
// b) the killing store is fully contained in the dead one and
726
// c) they both have a constant value
727
// d) none of the two stores need padding
728
// Merge the two stores, replacing the dead store's value with a
729
// merge of both values.
730
// TODO: Deal with other constant types (vectors, etc), and probably
731
// some mem intrinsics (if needed)
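// Illustrative example (made-up values, little-endian): merging a dead i32
// store of 0x11223344 at offset 0 with a killing i8 store of 0xFF at offset
// 1 gives BitOffsetDiff = 8, LShiftAmount = 8 and a merged constant of
// 0x1122FF44.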
732
733
APInt DeadValue = cast<ConstantInt>(DeadI->getValueOperand())->getValue();
734
APInt KillingValue =
735
cast<ConstantInt>(KillingI->getValueOperand())->getValue();
736
unsigned KillingBits = KillingValue.getBitWidth();
737
assert(DeadValue.getBitWidth() > KillingValue.getBitWidth());
738
KillingValue = KillingValue.zext(DeadValue.getBitWidth());
739
740
// Offset of the smaller store inside the larger store
741
unsigned BitOffsetDiff = (KillingOffset - DeadOffset) * 8;
742
unsigned LShiftAmount =
743
DL.isBigEndian() ? DeadValue.getBitWidth() - BitOffsetDiff - KillingBits
744
: BitOffsetDiff;
745
APInt Mask = APInt::getBitsSet(DeadValue.getBitWidth(), LShiftAmount,
746
LShiftAmount + KillingBits);
747
// Clear the bits we'll be replacing, then OR with the smaller
748
// store, shifted appropriately.
749
APInt Merged = (DeadValue & ~Mask) | (KillingValue << LShiftAmount);
750
LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Dead: " << *DeadI
751
<< "\n Killing: " << *KillingI
752
<< "\n Merged Value: " << Merged << '\n');
753
return ConstantInt::get(DeadI->getValueOperand()->getType(), Merged);
754
}
755
return nullptr;
756
}
757
758
namespace {
759
// Returns true if \p I is an intrinsic that does not read or write memory.
760
bool isNoopIntrinsic(Instruction *I) {
761
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
762
switch (II->getIntrinsicID()) {
763
case Intrinsic::lifetime_start:
764
case Intrinsic::lifetime_end:
765
case Intrinsic::invariant_end:
766
case Intrinsic::launder_invariant_group:
767
case Intrinsic::assume:
768
return true;
769
case Intrinsic::dbg_declare:
770
case Intrinsic::dbg_label:
771
case Intrinsic::dbg_value:
772
llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
773
default:
774
return false;
775
}
776
}
777
return false;
778
}
779
780
// Check if we can ignore \p D for DSE.
781
bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
782
Instruction *DI = D->getMemoryInst();
783
// Calls that only access inaccessible memory cannot read or write any memory
784
// locations we consider for elimination.
785
if (auto *CB = dyn_cast<CallBase>(DI))
786
if (CB->onlyAccessesInaccessibleMemory())
787
return true;
788
789
// We can eliminate stores to locations not visible to the caller across
790
// throwing instructions.
791
if (DI->mayThrow() && !DefVisibleToCaller)
792
return true;
793
794
// We can remove the dead stores, irrespective of the fence and its ordering
795
// (release/acquire/seq_cst). Fences only constrain the ordering of
796
// already visible stores; they do not make a store visible to other
797
// threads. So, skipping over a fence does not change a store from being
798
// dead.
799
if (isa<FenceInst>(DI))
800
return true;
801
802
// Skip intrinsics that do not really read or modify memory.
803
if (isNoopIntrinsic(DI))
804
return true;
805
806
return false;
807
}
808
809
struct DSEState {
810
Function &F;
811
AliasAnalysis &AA;
812
EarliestEscapeInfo EI;
813
814
/// The single BatchAA instance that is used to cache AA queries. It will
815
/// not be invalidated over the whole run. This is safe, because:
816
/// 1. Only memory writes are removed, so the alias cache for memory
817
/// locations remains valid.
818
/// 2. No new instructions are added (only instructions removed), so cached
819
/// information for a deleted value cannot be accessed by a re-used new
820
/// value pointer.
821
BatchAAResults BatchAA;
822
823
MemorySSA &MSSA;
824
DominatorTree &DT;
825
PostDominatorTree &PDT;
826
const TargetLibraryInfo &TLI;
827
const DataLayout &DL;
828
const LoopInfo &LI;
829
830
// Whether the function contains any irreducible control flow, useful for
831
// being accurately able to detect loops.
832
bool ContainsIrreducibleLoops;
833
834
// All MemoryDefs that potentially could kill other MemDefs.
835
SmallVector<MemoryDef *, 64> MemDefs;
836
// Any that should be skipped as they are already deleted
837
SmallPtrSet<MemoryAccess *, 4> SkipStores;
838
// Keep track whether a given object is captured before return or not.
839
DenseMap<const Value *, bool> CapturedBeforeReturn;
840
// Keep track of all of the objects that are invisible to the caller after
841
// the function returns.
842
DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
843
// Keep track of blocks with throwing instructions not modeled in MemorySSA.
844
SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
845
// Post-order numbers for each basic block. Used to figure out if memory
846
// accesses are executed before another access.
847
DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
848
849
/// Keep track of instructions (partly) overlapping with killing MemoryDefs per
850
/// basic block.
851
MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs;
852
// Check if there are root nodes that are terminated by UnreachableInst.
853
// Those roots pessimize post-dominance queries. If there are such roots,
854
// fall back to CFG scan starting from all non-unreachable roots.
855
bool AnyUnreachableExit;
856
857
// Whether or not we should iterate on removing dead stores at the end of the
858
// function due to removing a store causing a previously captured pointer to
859
// no longer be captured.
860
bool ShouldIterateEndOfFunctionDSE;
861
862
/// Dead instructions to be removed at the end of DSE.
863
SmallVector<Instruction *> ToRemove;
864
865
// Class contains self-reference, make sure it's not copied/moved.
866
DSEState(const DSEState &) = delete;
867
DSEState &operator=(const DSEState &) = delete;
868
869
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
870
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
871
const LoopInfo &LI)
872
: F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
873
PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
874
// Collect blocks with throwing instructions not modeled in MemorySSA and
875
// alloc-like objects.
876
unsigned PO = 0;
877
for (BasicBlock *BB : post_order(&F)) {
878
PostOrderNumbers[BB] = PO++;
879
for (Instruction &I : *BB) {
880
MemoryAccess *MA = MSSA.getMemoryAccess(&I);
881
if (I.mayThrow() && !MA)
882
ThrowingBlocks.insert(I.getParent());
883
884
auto *MD = dyn_cast_or_null<MemoryDef>(MA);
885
if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit &&
886
(getLocForWrite(&I) || isMemTerminatorInst(&I)))
887
MemDefs.push_back(MD);
888
}
889
}
890
891
// Treat byval or inalloca arguments the same as Allocas; stores to them are
892
// dead at the end of the function.
893
for (Argument &AI : F.args())
894
if (AI.hasPassPointeeByValueCopyAttr())
895
InvisibleToCallerAfterRet.insert({&AI, true});
896
897
// Collect whether there is any irreducible control flow in the function.
898
ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
899
900
AnyUnreachableExit = any_of(PDT.roots(), [](const BasicBlock *E) {
901
return isa<UnreachableInst>(E->getTerminator());
902
});
903
}
904
905
static void pushMemUses(MemoryAccess *Acc,
906
SmallVectorImpl<MemoryAccess *> &WorkList,
907
SmallPtrSetImpl<MemoryAccess *> &Visited) {
908
for (Use &U : Acc->uses()) {
909
auto *MA = cast<MemoryAccess>(U.getUser());
910
if (Visited.insert(MA).second)
911
WorkList.push_back(MA);
912
}
913
};
914
915
LocationSize strengthenLocationSize(const Instruction *I,
916
LocationSize Size) const {
917
if (auto *CB = dyn_cast<CallBase>(I)) {
918
LibFunc F;
919
if (TLI.getLibFunc(*CB, F) && TLI.has(F) &&
920
(F == LibFunc_memset_chk || F == LibFunc_memcpy_chk)) {
921
// Use the precise location size specified by the 3rd argument
922
// for determining whether KillingI overwrites DeadLoc if it is a memset_chk
923
// instruction. memset_chk will write either the amount specified as 3rd
924
// argument or the function will immediately abort and exit the program.
925
// NOTE: AA may determine NoAlias if it can prove that the access size
926
// is larger than the allocation size due to that being UB. To avoid
927
// returning potentially invalid NoAlias results by AA, limit the use of
928
// the precise location size to isOverwrite.
929
if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2)))
930
return LocationSize::precise(Len->getZExtValue());
931
}
932
}
933
return Size;
934
}
935
936
/// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p
937
/// KillingI instruction) completely overwrites a store to the 'DeadLoc'
938
/// location (by \p DeadI instruction).
939
/// Return OW_MaybePartial if \p KillingI does not completely overwrite
940
/// \p DeadI, but they both write to the same underlying object. In that
941
/// case, use isPartialOverwrite to check if \p KillingI partially overwrites
942
/// \p DeadI. Returns 'OW_None' if \p KillingI is known to not overwrite the
943
/// \p DeadI. Returns 'OW_Unknown' if nothing can be determined.
944
OverwriteResult isOverwrite(const Instruction *KillingI,
945
const Instruction *DeadI,
946
const MemoryLocation &KillingLoc,
947
const MemoryLocation &DeadLoc,
948
int64_t &KillingOff, int64_t &DeadOff) {
949
// AliasAnalysis does not always account for loops. Limit overwrite checks
950
// to dependencies for which we can guarantee they are independent of any
951
// loops they are in.
952
if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc))
953
return OW_Unknown;
954
955
LocationSize KillingLocSize =
956
strengthenLocationSize(KillingI, KillingLoc.Size);
957
const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts();
958
const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts();
959
const Value *DeadUndObj = getUnderlyingObject(DeadPtr);
960
const Value *KillingUndObj = getUnderlyingObject(KillingPtr);
961
962
// Check whether the killing store overwrites the whole object, in which
963
// case the size/offset of the dead store does not matter.
964
if (DeadUndObj == KillingUndObj && KillingLocSize.isPrecise() &&
965
isIdentifiedObject(KillingUndObj)) {
966
std::optional<TypeSize> KillingUndObjSize =
967
getPointerSize(KillingUndObj, DL, TLI, &F);
968
if (KillingUndObjSize && *KillingUndObjSize == KillingLocSize.getValue())
969
return OW_Complete;
970
}
971
972
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
973
// get imprecise values here, though (except for unknown sizes).
974
if (!KillingLocSize.isPrecise() || !DeadLoc.Size.isPrecise()) {
975
// In case no constant size is known, try to use the IR values for the number
976
// of bytes written and check if they match.
977
const auto *KillingMemI = dyn_cast<MemIntrinsic>(KillingI);
978
const auto *DeadMemI = dyn_cast<MemIntrinsic>(DeadI);
979
if (KillingMemI && DeadMemI) {
980
const Value *KillingV = KillingMemI->getLength();
981
const Value *DeadV = DeadMemI->getLength();
982
if (KillingV == DeadV && BatchAA.isMustAlias(DeadLoc, KillingLoc))
983
return OW_Complete;
984
}
985
986
// Masked stores have imprecise locations, but we can reason about them
987
// to some extent.
988
return isMaskedStoreOverwrite(KillingI, DeadI, BatchAA);
989
}
990
991
const TypeSize KillingSize = KillingLocSize.getValue();
992
const TypeSize DeadSize = DeadLoc.Size.getValue();
993
// Bail on doing Size comparison which depends on AA for now
994
// TODO: Remove AnyScalable once Alias Analysis deals with scalable vectors
995
const bool AnyScalable =
996
DeadSize.isScalable() || KillingLocSize.isScalable();
997
998
if (AnyScalable)
999
return OW_Unknown;
1000
// Query the alias information
1001
AliasResult AAR = BatchAA.alias(KillingLoc, DeadLoc);
1002
1003
// If the start pointers are the same, we just have to compare sizes to see if
1004
// the killing store was larger than the dead store.
1005
if (AAR == AliasResult::MustAlias) {
1006
// Make sure that KillingSize is >= DeadSize.
1007
if (KillingSize >= DeadSize)
1008
return OW_Complete;
1009
}
1010
1011
// If we hit a partial alias we may have a full overwrite
1012
if (AAR == AliasResult::PartialAlias && AAR.hasOffset()) {
1013
int32_t Off = AAR.getOffset();
1014
if (Off >= 0 && (uint64_t)Off + DeadSize <= KillingSize)
1015
return OW_Complete;
1016
}
1017
1018
// If we can't resolve the same pointers to the same object, then we can't
1019
// analyze them at all.
1020
if (DeadUndObj != KillingUndObj) {
1021
// Non aliasing stores to different objects don't overlap. Note that
1022
// if the killing store is known to overwrite whole object (out of
1023
// bounds access overwrites whole object as well) then it is assumed to
1024
// completely overwrite any store to the same object even if they don't
1025
// actually alias (see next check).
1026
if (AAR == AliasResult::NoAlias)
1027
return OW_None;
1028
return OW_Unknown;
1029
}
1030
1031
// Okay, we have stores to two completely different pointers. Try to
1032
// decompose the pointer into a "base + constant_offset" form. If the base
1033
// pointers are equal, then we can reason about the two stores.
1034
DeadOff = 0;
1035
KillingOff = 0;
1036
const Value *DeadBasePtr =
1037
GetPointerBaseWithConstantOffset(DeadPtr, DeadOff, DL);
1038
const Value *KillingBasePtr =
1039
GetPointerBaseWithConstantOffset(KillingPtr, KillingOff, DL);
1040
1041
// If the base pointers still differ, we have two completely different
1042
// stores.
1043
if (DeadBasePtr != KillingBasePtr)
1044
return OW_Unknown;
1045
1046
// The killing access completely overlaps the dead store if and only if
1047
// both start and end of the dead one is "inside" the killing one:
1048
// |<->|--dead--|<->|
1049
// |-----killing------|
1050
// Accesses may overlap if and only if start of one of them is "inside"
1051
// another one:
1052
// |<->|--dead--|<-------->|
1053
// |-------killing--------|
1054
// OR
1055
// |-------dead-------|
1056
// |<->|---killing---|<----->|
1057
//
1058
// We have to be careful here as *Off is signed while *.Size is unsigned.
1059
1060
// Check if the dead access starts "not before" the killing one.
1061
if (DeadOff >= KillingOff) {
1062
// If the dead access ends "not after" the killing access then the
1063
// dead one is completely overwritten by the killing one.
1064
if (uint64_t(DeadOff - KillingOff) + DeadSize <= KillingSize)
1065
return OW_Complete;
1066
// If start of the dead access is "before" end of the killing access
1067
// then accesses overlap.
1068
else if ((uint64_t)(DeadOff - KillingOff) < KillingSize)
1069
return OW_MaybePartial;
1070
}
1071
// If start of the killing access is "before" end of the dead access then
1072
// accesses overlap.
1073
else if ((uint64_t)(KillingOff - DeadOff) < DeadSize) {
1074
return OW_MaybePartial;
1075
}
1076
1077
// Can reach here only if accesses are known not to overlap.
1078
return OW_None;
1079
}
1080
1081
bool isInvisibleToCallerAfterRet(const Value *V) {
1082
if (isa<AllocaInst>(V))
1083
return true;
1084
auto I = InvisibleToCallerAfterRet.insert({V, false});
1085
if (I.second) {
1086
if (!isInvisibleToCallerOnUnwind(V)) {
1087
I.first->second = false;
1088
} else if (isNoAliasCall(V)) {
1089
I.first->second = !PointerMayBeCaptured(V, true, false);
1090
}
1091
}
1092
return I.first->second;
1093
}
1094
1095
bool isInvisibleToCallerOnUnwind(const Value *V) {
1096
bool RequiresNoCaptureBeforeUnwind;
1097
if (!isNotVisibleOnUnwind(V, RequiresNoCaptureBeforeUnwind))
1098
return false;
1099
if (!RequiresNoCaptureBeforeUnwind)
1100
return true;
1101
1102
auto I = CapturedBeforeReturn.insert({V, true});
1103
if (I.second)
1104
// NOTE: This could be made more precise by PointerMayBeCapturedBefore
1105
// with the killing MemoryDef. But we refrain from doing so for now to
1106
// limit compile-time and this does not cause any changes to the number
1107
// of stores removed on a large test set in practice.
1108
I.first->second = PointerMayBeCaptured(V, false, true);
1109
return !I.first->second;
1110
}
1111
1112
std::optional<MemoryLocation> getLocForWrite(Instruction *I) const {
1113
if (!I->mayWriteToMemory())
1114
return std::nullopt;
1115
1116
if (auto *CB = dyn_cast<CallBase>(I))
1117
return MemoryLocation::getForDest(CB, TLI);
1118
1119
return MemoryLocation::getOrNone(I);
1120
}
1121
1122
/// Assuming this instruction has a dead analyzable write, can we delete
1123
/// this instruction?
1124
bool isRemovable(Instruction *I) {
1125
assert(getLocForWrite(I) && "Must have analyzable write");
1126
1127
// Don't remove volatile/atomic stores.
1128
if (StoreInst *SI = dyn_cast<StoreInst>(I))
1129
return SI->isUnordered();
1130
1131
if (auto *CB = dyn_cast<CallBase>(I)) {
1132
// Don't remove volatile memory intrinsics.
1133
if (auto *MI = dyn_cast<MemIntrinsic>(CB))
1134
return !MI->isVolatile();
1135
1136
// Never remove dead lifetime intrinsics, e.g. because they are followed
1137
// by a free.
1138
if (CB->isLifetimeStartOrEnd())
1139
return false;
1140
1141
return CB->use_empty() && CB->willReturn() && CB->doesNotThrow() &&
1142
!CB->isTerminator();
1143
}
1144
1145
return false;
1146
}
1147
1148
/// Returns true if \p UseInst completely overwrites \p DefLoc
1149
/// (stored by \p DefInst).
1150
bool isCompleteOverwrite(const MemoryLocation &DefLoc, Instruction *DefInst,
1151
Instruction *UseInst) {
1152
// UseInst has a MemoryDef associated in MemorySSA. It's possible for a
1153
// MemoryDef to not write to memory, e.g. a volatile load is modeled as a
1154
// MemoryDef.
1155
if (!UseInst->mayWriteToMemory())
1156
return false;
1157
1158
if (auto *CB = dyn_cast<CallBase>(UseInst))
1159
if (CB->onlyAccessesInaccessibleMemory())
1160
return false;
1161
1162
int64_t InstWriteOffset, DepWriteOffset;
1163
if (auto CC = getLocForWrite(UseInst))
1164
return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset,
1165
DepWriteOffset) == OW_Complete;
1166
return false;
1167
}
1168
1169
/// Returns true if \p Def is not read before returning from the function.
1170
bool isWriteAtEndOfFunction(MemoryDef *Def, const MemoryLocation &DefLoc) {
1171
LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
1172
<< *Def->getMemoryInst()
1173
<< ") is at the end the function \n");
1174
SmallVector<MemoryAccess *, 4> WorkList;
1175
SmallPtrSet<MemoryAccess *, 8> Visited;
1176
1177
pushMemUses(Def, WorkList, Visited);
1178
for (unsigned I = 0; I < WorkList.size(); I++) {
1179
if (WorkList.size() >= MemorySSAScanLimit) {
1180
LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
1181
return false;
1182
}
1183
1184
MemoryAccess *UseAccess = WorkList[I];
1185
if (isa<MemoryPhi>(UseAccess)) {
1186
// AliasAnalysis does not account for loops. Limit elimination to
1187
// candidates for which we can guarantee they always store to the same
1188
// memory location.
1189
if (!isGuaranteedLoopInvariant(DefLoc.Ptr))
1190
return false;
1191
1192
pushMemUses(cast<MemoryPhi>(UseAccess), WorkList, Visited);
1193
continue;
1194
}
1195
// TODO: Checking for aliasing is expensive. Consider reducing the amount
1196
// of times this is called and/or caching it.
1197
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
1198
if (isReadClobber(DefLoc, UseInst)) {
1199
LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
1200
return false;
1201
}
1202
1203
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
1204
pushMemUses(UseDef, WorkList, Visited);
1205
}
1206
return true;
1207
}
1208
1209
/// If \p I is a memory terminator like llvm.lifetime.end or free, return a
1210
/// pair with the MemoryLocation terminated by \p I and a boolean flag
1211
/// indicating whether \p I is a free-like call.
1212
std::optional<std::pair<MemoryLocation, bool>>
1213
getLocForTerminator(Instruction *I) const {
1214
uint64_t Len;
1215
Value *Ptr;
1216
if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
1217
m_Value(Ptr))))
1218
return {std::make_pair(MemoryLocation(Ptr, Len), false)};
1219
1220
if (auto *CB = dyn_cast<CallBase>(I)) {
1221
if (Value *FreedOp = getFreedOperand(CB, &TLI))
1222
return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
1223
}
1224
1225
return std::nullopt;
1226
}
1227
1228
/// Returns true if \p I is a memory terminator instruction like
1229
/// llvm.lifetime.end or free.
1230
bool isMemTerminatorInst(Instruction *I) const {
1231
auto *CB = dyn_cast<CallBase>(I);
1232
return CB && (CB->getIntrinsicID() == Intrinsic::lifetime_end ||
1233
getFreedOperand(CB, &TLI) != nullptr);
1234
}
1235
1236
/// Returns true if \p MaybeTerm is a memory terminator for \p Loc from
1237
/// instruction \p AccessI.
1238
bool isMemTerminator(const MemoryLocation &Loc, Instruction *AccessI,
1239
Instruction *MaybeTerm) {
1240
std::optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
1241
getLocForTerminator(MaybeTerm);
1242
1243
if (!MaybeTermLoc)
1244
return false;
1245
1246
// If the terminator is a free-like call, all accesses to the underlying
1247
// object can be considered terminated.
1248
if (getUnderlyingObject(Loc.Ptr) !=
1249
getUnderlyingObject(MaybeTermLoc->first.Ptr))
1250
return false;
1251
1252
auto TermLoc = MaybeTermLoc->first;
1253
if (MaybeTermLoc->second) {
1254
const Value *LocUO = getUnderlyingObject(Loc.Ptr);
1255
return BatchAA.isMustAlias(TermLoc.Ptr, LocUO);
1256
}
1257
int64_t InstWriteOffset = 0;
1258
int64_t DepWriteOffset = 0;
1259
return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, InstWriteOffset,
1260
DepWriteOffset) == OW_Complete;
1261
}
1262
1263
// Returns true if \p Use may read from \p DefLoc.
1264
bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
1265
if (isNoopIntrinsic(UseInst))
1266
return false;
1267
1268
// Monotonic or weaker atomic stores can be re-ordered and do not need to be
1269
// treated as read clobber.
1270
if (auto SI = dyn_cast<StoreInst>(UseInst))
1271
return isStrongerThan(SI->getOrdering(), AtomicOrdering::Monotonic);
1272
1273
if (!UseInst->mayReadFromMemory())
1274
return false;
1275
1276
if (auto *CB = dyn_cast<CallBase>(UseInst))
1277
if (CB->onlyAccessesInaccessibleMemory())
1278
return false;
1279
1280
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
1281
}
1282
1283
/// Returns true if a dependency between \p Current and \p KillingDef is
1284
/// guaranteed to be loop invariant for the loops that they are in. Either
1285
/// because they are known to be in the same block, in the same loop level or
1286
/// by guaranteeing that \p CurrentLoc only references a single MemoryLocation
1287
/// during execution of the containing function.
1288
bool isGuaranteedLoopIndependent(const Instruction *Current,
1289
const Instruction *KillingDef,
1290
const MemoryLocation &CurrentLoc) {
1291
// If the dependency is within the same block or loop level (being careful
1292
// of irreducible loops), we know that AA will return a valid result for the
1293
// memory dependency. (Both at the function level, outside of any loop,
1294
// would also be valid but we currently disable that to limit compile time).
1295
if (Current->getParent() == KillingDef->getParent())
1296
return true;
1297
const Loop *CurrentLI = LI.getLoopFor(Current->getParent());
1298
if (!ContainsIrreducibleLoops && CurrentLI &&
1299
CurrentLI == LI.getLoopFor(KillingDef->getParent()))
1300
return true;
1301
// Otherwise check the memory location is invariant to any loops.
1302
return isGuaranteedLoopInvariant(CurrentLoc.Ptr);
1303
}
1304
1305
/// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
1306
/// loop. In particular, this guarantees that it only references a single
1307
/// MemoryLocation during execution of the containing function.
1308
bool isGuaranteedLoopInvariant(const Value *Ptr) {
1309
Ptr = Ptr->stripPointerCasts();
1310
if (auto *GEP = dyn_cast<GEPOperator>(Ptr))
1311
if (GEP->hasAllConstantIndices())
1312
Ptr = GEP->getPointerOperand()->stripPointerCasts();
1313
1314
if (auto *I = dyn_cast<Instruction>(Ptr)) {
1315
return I->getParent()->isEntryBlock() ||
1316
(!ContainsIrreducibleLoops && !LI.getLoopFor(I->getParent()));
1317
}
1318
return true;
1319
}
1320
1321
// Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess,
1322
// with no read access between them or on any other path to a function exit
1323
// block if \p KillingLoc is not accessible after the function returns. If
1324
// there is no such MemoryDef, return std::nullopt. The returned value may not
1325
// (completely) overwrite \p KillingLoc. Currently we bail out when we
1326
// encounter an aliasing MemoryUse (read).
1327
std::optional<MemoryAccess *>
1328
getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
1329
const MemoryLocation &KillingLoc, const Value *KillingUndObj,
1330
unsigned &ScanLimit, unsigned &WalkerStepLimit,
1331
bool IsMemTerm, unsigned &PartialLimit) {
1332
if (ScanLimit == 0 || WalkerStepLimit == 0) {
1333
LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
1334
return std::nullopt;
1335
}
1336
1337
MemoryAccess *Current = StartAccess;
1338
Instruction *KillingI = KillingDef->getMemoryInst();
1339
LLVM_DEBUG(dbgs() << " trying to get dominating access\n");
1340
1341
// Only optimize defining access of KillingDef when directly starting at its
1342
// defining access. The defining access also must only access KillingLoc. At
1343
// the moment we only support instructions with a single write location, so
1344
// it should be sufficient to disable optimizations for instructions that
1345
// also read from memory.
1346
bool CanOptimize = OptimizeMemorySSA &&
1347
KillingDef->getDefiningAccess() == StartAccess &&
1348
!KillingI->mayReadFromMemory();
1349
1350
// Find the next clobbering Mod access for DefLoc, starting at StartAccess.
1351
std::optional<MemoryLocation> CurrentLoc;
1352
for (;; Current = cast<MemoryDef>(Current)->getDefiningAccess()) {
1353
LLVM_DEBUG({
1354
dbgs() << " visiting " << *Current;
1355
if (!MSSA.isLiveOnEntryDef(Current) && isa<MemoryUseOrDef>(Current))
1356
dbgs() << " (" << *cast<MemoryUseOrDef>(Current)->getMemoryInst()
1357
<< ")";
1358
dbgs() << "\n";
1359
});
1360
1361
// Reached TOP.
1362
if (MSSA.isLiveOnEntryDef(Current)) {
1363
LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n");
1364
if (CanOptimize && Current != KillingDef->getDefiningAccess())
1365
// The first clobbering def is... none.
1366
KillingDef->setOptimized(Current);
1367
return std::nullopt;
1368
}
1369
1370
// Cost of a step. Accesses in the same block are more likely to be valid
1371
// candidates for elimination, hence consider them cheaper.
1372
unsigned StepCost = KillingDef->getBlock() == Current->getBlock()
1373
? MemorySSASameBBStepCost
1374
: MemorySSAOtherBBStepCost;
1375
if (WalkerStepLimit <= StepCost) {
1376
LLVM_DEBUG(dbgs() << " ... hit walker step limit\n");
1377
return std::nullopt;
1378
}
1379
WalkerStepLimit -= StepCost;
1380
1381
// Return for MemoryPhis. They cannot be eliminated directly and the
1382
// caller is responsible for traversing them.
1383
if (isa<MemoryPhi>(Current)) {
1384
LLVM_DEBUG(dbgs() << " ... found MemoryPhi\n");
1385
return Current;
1386
}
1387
1388
// Below, check if CurrentDef is a valid candidate to be eliminated by
1389
// KillingDef. If it is not, check the next candidate.
1390
MemoryDef *CurrentDef = cast<MemoryDef>(Current);
1391
Instruction *CurrentI = CurrentDef->getMemoryInst();
1392
1393
if (canSkipDef(CurrentDef, !isInvisibleToCallerOnUnwind(KillingUndObj))) {
1394
CanOptimize = false;
1395
continue;
1396
}
1397
1398
// Before we try to remove anything, check for any extra throwing
1399
// instructions that block us from DSEing
1400
if (mayThrowBetween(KillingI, CurrentI, KillingUndObj)) {
1401
LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
1402
return std::nullopt;
1403
}
1404
1405
// Check for anything that looks like it will be a barrier to further
1406
// removal
1407
if (isDSEBarrier(KillingUndObj, CurrentI)) {
1408
LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
1409
return std::nullopt;
1410
}
1411
1412
// If Current is known to be on a path that reads DefLoc or is a read
1413
// clobber, bail out, as the path is not profitable. We skip this check
1414
// for intrinsic calls, because the code knows how to handle memcpy
1415
// intrinsics.
1416
if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(KillingLoc, CurrentI))
1417
return std::nullopt;
1418
1419
// Quick check if there are direct uses that are read-clobbers.
1420
if (any_of(Current->uses(), [this, &KillingLoc, StartAccess](Use &U) {
1421
if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(U.getUser()))
1422
return !MSSA.dominates(StartAccess, UseOrDef) &&
1423
isReadClobber(KillingLoc, UseOrDef->getMemoryInst());
1424
return false;
1425
})) {
1426
LLVM_DEBUG(dbgs() << " ... found a read clobber\n");
1427
return std::nullopt;
1428
}
1429
1430
// If Current does not have an analyzable write location or is not
1431
// removable, skip it.
1432
CurrentLoc = getLocForWrite(CurrentI);
1433
if (!CurrentLoc || !isRemovable(CurrentI)) {
1434
CanOptimize = false;
1435
continue;
1436
}
1437
1438
// AliasAnalysis does not account for loops. Limit elimination to
1439
// candidates for which we can guarantee they always store to the same
1440
// memory location and are not located in different loops.
1441
if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
1442
LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n");
1443
CanOptimize = false;
1444
continue;
1445
}
1446
1447
if (IsMemTerm) {
1448
// If the killing def is a memory terminator (e.g. lifetime.end), check
1449
// the next candidate if Current does not write to the same
1450
// underlying object as the terminator.
1451
if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) {
1452
CanOptimize = false;
1453
continue;
1454
}
1455
} else {
1456
int64_t KillingOffset = 0;
1457
int64_t DeadOffset = 0;
1458
auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc,
1459
KillingOffset, DeadOffset);
1460
if (CanOptimize) {
1461
// CurrentDef is the earliest write clobber of KillingDef. Use it as
1462
// optimized access. Do not optimize if CurrentDef is already the
1463
// defining access of KillingDef.
1464
if (CurrentDef != KillingDef->getDefiningAccess() &&
1465
(OR == OW_Complete || OR == OW_MaybePartial))
1466
KillingDef->setOptimized(CurrentDef);
1467
1468
// Once a may-aliasing def is encountered do not set an optimized
1469
// access.
1470
if (OR != OW_None)
1471
CanOptimize = false;
1472
}
1473
1474
// If Current does not write to the same object as KillingDef, check
1475
// the next candidate.
1476
if (OR == OW_Unknown || OR == OW_None)
1477
continue;
1478
else if (OR == OW_MaybePartial) {
1479
// If KillingDef only partially overwrites Current, check the next
1480
// candidate if the partial step limit is exceeded. This aggressively
1481
// limits the number of candidates for partial store elimination,
1482
// which are less likely to be removable in the end.
1483
if (PartialLimit <= 1) {
1484
WalkerStepLimit -= 1;
1485
LLVM_DEBUG(dbgs() << " ... reached partial limit ... continue with next access\n");
1486
continue;
1487
}
1488
PartialLimit -= 1;
1489
}
1490
}
1491
break;
1492
};
1493
1494
// Accesses to objects accessible after the function returns can only be
1495
// eliminated if the access is dead along all paths to the exit. Collect
1496
// the blocks with killing (i.e. completely overwriting) MemoryDefs and check if
1497
// they cover all paths from MaybeDeadAccess to any function exit.
1498
SmallPtrSet<Instruction *, 16> KillingDefs;
1499
KillingDefs.insert(KillingDef->getMemoryInst());
1500
MemoryAccess *MaybeDeadAccess = Current;
1501
MemoryLocation MaybeDeadLoc = *CurrentLoc;
1502
Instruction *MaybeDeadI = cast<MemoryDef>(MaybeDeadAccess)->getMemoryInst();
1503
LLVM_DEBUG(dbgs() << " Checking for reads of " << *MaybeDeadAccess << " ("
1504
<< *MaybeDeadI << ")\n");
1505
1506
SmallVector<MemoryAccess *, 32> WorkList;
1507
SmallPtrSet<MemoryAccess *, 32> Visited;
1508
pushMemUses(MaybeDeadAccess, WorkList, Visited);
1509
1510
// Check if DeadDef may be read.
1511
for (unsigned I = 0; I < WorkList.size(); I++) {
1512
MemoryAccess *UseAccess = WorkList[I];
1513
1514
LLVM_DEBUG(dbgs() << " " << *UseAccess);
1515
// Bail out if the number of accesses to check exceeds the scan limit.
1516
if (ScanLimit < (WorkList.size() - I)) {
1517
LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
1518
return std::nullopt;
1519
}
1520
--ScanLimit;
1521
NumDomMemDefChecks++;
1522
1523
if (isa<MemoryPhi>(UseAccess)) {
1524
if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) {
1525
return DT.properlyDominates(KI->getParent(),
1526
UseAccess->getBlock());
1527
})) {
1528
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing block\n");
1529
continue;
1530
}
1531
LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
1532
pushMemUses(UseAccess, WorkList, Visited);
1533
continue;
1534
}
1535
1536
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
1537
LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
1538
1539
if (any_of(KillingDefs, [this, UseInst](Instruction *KI) {
1540
return DT.dominates(KI, UseInst);
1541
})) {
1542
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing def\n");
1543
continue;
1544
}
1545
1546
// A memory terminator kills all preceding MemoryDefs and all succeeding
1547
// MemoryAccesses. We do not have to check its users.
1548
if (isMemTerminator(MaybeDeadLoc, MaybeDeadI, UseInst)) {
1549
LLVM_DEBUG(
1550
dbgs()
1551
<< " ... skipping, memterminator invalidates following accesses\n");
1552
continue;
1553
}
1554
1555
if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess)->getMemoryInst())) {
1556
LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
1557
pushMemUses(UseAccess, WorkList, Visited);
1558
continue;
1559
}
1560
1561
if (UseInst->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) {
1562
LLVM_DEBUG(dbgs() << " ... found throwing instruction\n");
1563
return std::nullopt;
1564
}
1565
1566
// Uses which may read the original MemoryDef mean we cannot eliminate the
1567
// original MD. Stop walk.
1568
if (isReadClobber(MaybeDeadLoc, UseInst)) {
1569
LLVM_DEBUG(dbgs() << " ... found read clobber\n");
1570
return std::nullopt;
1571
}
1572
1573
// If this worklist walks back to the original memory access (and the
1574
// pointer is not guaranteed loop invariant) then we cannot assume that a
1575
// store kills itself.
1576
if (MaybeDeadAccess == UseAccess &&
1577
!isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) {
1578
LLVM_DEBUG(dbgs() << " ... found not loop invariant self access\n");
1579
return std::nullopt;
1580
}
1581
// Otherwise, for the KillingDef and MaybeDeadAccess we only have to check
1582
// if it reads the memory location.
1583
// TODO: It would probably be better to check for self-reads before
1584
// calling the function.
1585
if (KillingDef == UseAccess || MaybeDeadAccess == UseAccess) {
1586
LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
1587
continue;
1588
}
1589
1590
// Check all uses for MemoryDefs, except for defs completely overwriting
1591
// the original location. Otherwise we have to check uses of *all*
1592
// MemoryDefs we discover, including non-aliasing ones. Otherwise we might
1593
// miss cases like the following
1594
// 1 = Def(LoE) ; <----- DeadDef stores [0,1]
1595
// 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
1596
// Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
1597
// (The Use points to the *first* Def it may alias)
1598
// 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
1599
// stores [0,1]
1600
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
1601
if (isCompleteOverwrite(MaybeDeadLoc, MaybeDeadI, UseInst)) {
1602
BasicBlock *MaybeKillingBlock = UseInst->getParent();
1603
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
1604
PostOrderNumbers.find(MaybeDeadAccess->getBlock())->second) {
1605
if (!isInvisibleToCallerAfterRet(KillingUndObj)) {
1606
LLVM_DEBUG(dbgs()
1607
<< " ... found killing def " << *UseInst << "\n");
1608
KillingDefs.insert(UseInst);
1609
}
1610
} else {
1611
LLVM_DEBUG(dbgs()
1612
<< " ... found preceeding def " << *UseInst << "\n");
1613
return std::nullopt;
1614
}
1615
} else
1616
pushMemUses(UseDef, WorkList, Visited);
1617
}
1618
}
1619
1620
// For accesses to locations visible after the function returns, make sure
1621
// that the location is dead (=overwritten) along all paths from
1622
// MaybeDeadAccess to the exit.
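// Schematic example (hypothetical IR): with a global @g, the store in %entry
// is only dead if every path to an exit overwrites it first:
//   entry: store i32 0, ptr @g      <- MaybeDeadAccess
//          br i1 %c, label %a, label %b
//   a:     store i32 1, ptr @g      <- killing block
//          br label %exit
//   b:     br label %exit           <- @g still holds 0 on this path
// The killing blocks do not cover the path through %b, so the candidate is
// rejected by the post-dominance check below.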
1623
if (!isInvisibleToCallerAfterRet(KillingUndObj)) {
1624
SmallPtrSet<BasicBlock *, 16> KillingBlocks;
1625
for (Instruction *KD : KillingDefs)
1626
KillingBlocks.insert(KD->getParent());
1627
assert(!KillingBlocks.empty() &&
1628
"Expected at least a single killing block");
1629
1630
// Find the common post-dominator of all killing blocks.
1631
BasicBlock *CommonPred = *KillingBlocks.begin();
1632
for (BasicBlock *BB : llvm::drop_begin(KillingBlocks)) {
1633
if (!CommonPred)
1634
break;
1635
CommonPred = PDT.findNearestCommonDominator(CommonPred, BB);
1636
}
1637
1638
// If the common post-dominator does not post-dominate MaybeDeadAccess,
1639
// there is a path from MaybeDeadAccess to an exit not going through a
1640
// killing block.
1641
if (!PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) {
1642
if (!AnyUnreachableExit)
1643
return std::nullopt;
1644
1645
// Fall back to CFG scan starting at all non-unreachable roots if not
1646
// all paths to the exit go through CommonPred.
1647
CommonPred = nullptr;
1648
}
1649
1650
// If CommonPred itself is in the set of killing blocks, we're done.
1651
if (KillingBlocks.count(CommonPred))
1652
return {MaybeDeadAccess};
1653
1654
SetVector<BasicBlock *> WorkList;
1655
// If CommonPred is null, there are multiple exits from the function.
1656
// They all have to be added to the worklist.
1657
if (CommonPred)
1658
WorkList.insert(CommonPred);
1659
else
1660
for (BasicBlock *R : PDT.roots()) {
1661
if (!isa<UnreachableInst>(R->getTerminator()))
1662
WorkList.insert(R);
1663
}
1664
1665
NumCFGTries++;
1666
// Check if all paths starting from an exit node go through one of the
1667
// killing blocks before reaching MaybeDeadAccess.
1668
for (unsigned I = 0; I < WorkList.size(); I++) {
1669
NumCFGChecks++;
1670
BasicBlock *Current = WorkList[I];
1671
if (KillingBlocks.count(Current))
1672
continue;
1673
if (Current == MaybeDeadAccess->getBlock())
1674
return std::nullopt;
1675
1676
// MaybeDeadAccess is reachable from the entry, so we don't have to
1677
// explore unreachable blocks further.
1678
if (!DT.isReachableFromEntry(Current))
1679
continue;
1680
1681
for (BasicBlock *Pred : predecessors(Current))
1682
WorkList.insert(Pred);
1683
1684
if (WorkList.size() >= MemorySSAPathCheckLimit)
1685
return std::nullopt;
1686
}
1687
NumCFGSuccess++;
1688
}
1689
1690
// No aliasing MemoryUses of MaybeDeadAccess found, MaybeDeadAccess is
1691
// potentially dead.
1692
return {MaybeDeadAccess};
1693
}
1694
1695
/// Delete dead memory defs and recursively add their operands to ToRemove if
1696
/// they become dead.
1697
void
1698
deleteDeadInstruction(Instruction *SI,
1699
SmallPtrSetImpl<MemoryAccess *> *Deleted = nullptr) {
1700
MemorySSAUpdater Updater(&MSSA);
1701
SmallVector<Instruction *, 32> NowDeadInsts;
1702
NowDeadInsts.push_back(SI);
1703
--NumFastOther;
1704
1705
while (!NowDeadInsts.empty()) {
1706
Instruction *DeadInst = NowDeadInsts.pop_back_val();
1707
++NumFastOther;
1708
1709
// Try to preserve debug information attached to the dead instruction.
1710
salvageDebugInfo(*DeadInst);
1711
salvageKnowledge(DeadInst);
1712
1713
// Remove the Instruction from MSSA.
1714
MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst);
1715
bool IsMemDef = MA && isa<MemoryDef>(MA);
1716
if (MA) {
1717
if (IsMemDef) {
1718
auto *MD = cast<MemoryDef>(MA);
1719
SkipStores.insert(MD);
1720
if (Deleted)
1721
Deleted->insert(MD);
1722
if (auto *SI = dyn_cast<StoreInst>(MD->getMemoryInst())) {
1723
if (SI->getValueOperand()->getType()->isPointerTy()) {
1724
const Value *UO = getUnderlyingObject(SI->getValueOperand());
1725
if (CapturedBeforeReturn.erase(UO))
1726
ShouldIterateEndOfFunctionDSE = true;
1727
InvisibleToCallerAfterRet.erase(UO);
1728
}
1729
}
1730
}
1731
1732
Updater.removeMemoryAccess(MA);
1733
}
1734
1735
auto I = IOLs.find(DeadInst->getParent());
1736
if (I != IOLs.end())
1737
I->second.erase(DeadInst);
1738
// Remove its operands
1739
for (Use &O : DeadInst->operands())
1740
if (Instruction *OpI = dyn_cast<Instruction>(O)) {
1741
O.set(PoisonValue::get(O->getType()));
1742
if (isInstructionTriviallyDead(OpI, &TLI))
1743
NowDeadInsts.push_back(OpI);
1744
}
1745
1746
EI.removeInstruction(DeadInst);
1747
// Remove memory defs directly if they don't produce results, but only
1748
// queue other dead instructions for later removal. They may have been
1749
// used as memory locations that have been cached by BatchAA. Removing
1750
// them here may lead to newly created instructions being allocated at the
1751
// same address, yielding stale cache entries.
1752
if (IsMemDef && DeadInst->getType()->isVoidTy())
1753
DeadInst->eraseFromParent();
1754
else
1755
ToRemove.push_back(DeadInst);
1756
}
1757
}
1758
1759
// Check for any extra throws between \p KillingI and \p DeadI that block
1760
// DSE. This only checks extra maythrows (those that aren't MemoryDefs).
1761
// MemoryDefs that may throw are handled during the walk from one def to the
1762
// next.
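// Schematic example (hypothetical IR): a potentially throwing call between
// the two stores keeps the first one alive when @g is visible on unwind:
//   store i32 0, ptr @g        ; DeadI - must survive if the call unwinds
//   call void @may_throw()
//   store i32 1, ptr @g        ; KillingI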
1763
bool mayThrowBetween(Instruction *KillingI, Instruction *DeadI,
1764
const Value *KillingUndObj) {
1765
// First see if we can ignore it by using the fact that KillingUndObj is an
// alloca/alloca-like object that is not visible to the caller during
// execution of the function.
1768
if (KillingUndObj && isInvisibleToCallerOnUnwind(KillingUndObj))
1769
return false;
1770
1771
if (KillingI->getParent() == DeadI->getParent())
1772
return ThrowingBlocks.count(KillingI->getParent());
1773
return !ThrowingBlocks.empty();
1774
}
1775
1776
// Check if \p DeadI acts as a DSE barrier for \p KillingI. The following
1777
// instructions act as barriers:
1778
// * A memory instruction that may throw and \p KillingI accesses a non-stack
1779
// object.
1780
// * Atomic stores stronger than monotonic.
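// Schematic example (hypothetical IR): the release store below is treated as
// a barrier, so the preceding store to @p is conservatively kept:
//   store i32 0, ptr @p
//   store atomic i32 1, ptr @flag release, align 4   ; barrier
//   store i32 1, ptr @p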
1781
bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) {
1782
// If DeadI may throw it acts as a barrier, unless the killing store's
// underlying object is an alloca/alloca-like object that does not escape.
1784
if (DeadI->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj))
1785
return true;
1786
1787
// If DeadI is an atomic load/store stronger than monotonic, do not try to
1788
// eliminate/reorder it.
1789
if (DeadI->isAtomic()) {
1790
if (auto *LI = dyn_cast<LoadInst>(DeadI))
1791
return isStrongerThanMonotonic(LI->getOrdering());
1792
if (auto *SI = dyn_cast<StoreInst>(DeadI))
1793
return isStrongerThanMonotonic(SI->getOrdering());
1794
if (auto *ARMW = dyn_cast<AtomicRMWInst>(DeadI))
1795
return isStrongerThanMonotonic(ARMW->getOrdering());
1796
if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(DeadI))
1797
return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) ||
1798
isStrongerThanMonotonic(CmpXchg->getFailureOrdering());
1799
llvm_unreachable("other instructions should be skipped in MemorySSA");
1800
}
1801
return false;
1802
}
1803
1804
/// Eliminate writes to objects that are not visible in the caller and are not
1805
/// accessed before returning from the function.
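/// Illustrative case (hypothetical IR): the store below is dead because the
/// alloca cannot be observed once the function returns:
///   %a = alloca i32
///   store i32 42, ptr %a
///   ret void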
1806
bool eliminateDeadWritesAtEndOfFunction() {
1807
bool MadeChange = false;
1808
LLVM_DEBUG(
1809
dbgs()
1810
<< "Trying to eliminate MemoryDefs at the end of the function\n");
1811
do {
1812
ShouldIterateEndOfFunctionDSE = false;
1813
for (MemoryDef *Def : llvm::reverse(MemDefs)) {
1814
if (SkipStores.contains(Def))
1815
continue;
1816
1817
Instruction *DefI = Def->getMemoryInst();
1818
auto DefLoc = getLocForWrite(DefI);
1819
if (!DefLoc || !isRemovable(DefI)) {
1820
LLVM_DEBUG(dbgs() << " ... could not get location for write or "
1821
"instruction not removable.\n");
1822
continue;
1823
}
1824
1825
// NOTE: Currently eliminating writes at the end of a function is
1826
// limited to MemoryDefs with a single underlying object, to save
1827
// compile-time. In practice it appears the case with multiple
1828
// underlying objects is very uncommon. If it turns out to be important,
1829
// we can use getUnderlyingObjects here instead.
1830
const Value *UO = getUnderlyingObject(DefLoc->Ptr);
1831
if (!isInvisibleToCallerAfterRet(UO))
1832
continue;
1833
1834
if (isWriteAtEndOfFunction(Def, *DefLoc)) {
1835
// See through pointer-to-pointer bitcasts
1836
LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
1837
"of the function\n");
1838
deleteDeadInstruction(DefI);
1839
++NumFastStores;
1840
MadeChange = true;
1841
}
1842
}
1843
} while (ShouldIterateEndOfFunctionDSE);
1844
return MadeChange;
1845
}
1846
1847
/// If we have a zero-initializing memset following a call to malloc,
1848
/// try folding it into a call to calloc.
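/// Illustrative fold (hypothetical IR):
///   %p = call ptr @malloc(i64 %n)
///   call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 %n, i1 false)
/// becomes
///   %p = call ptr @calloc(i64 1, i64 %n)
/// provided the memset is removable, covers the whole allocation, and the
/// memory is not modified between the two calls.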
1849
bool tryFoldIntoCalloc(MemoryDef *Def, const Value *DefUO) {
1850
Instruction *DefI = Def->getMemoryInst();
1851
MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI);
1852
if (!MemSet)
1853
// TODO: Could handle zero store to small allocation as well.
1854
return false;
1855
Constant *StoredConstant = dyn_cast<Constant>(MemSet->getValue());
1856
if (!StoredConstant || !StoredConstant->isNullValue())
1857
return false;
1858
1859
if (!isRemovable(DefI))
1860
// The memset might be volatile..
1861
return false;
1862
1863
if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
1864
F.hasFnAttribute(Attribute::SanitizeAddress) ||
1865
F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
1866
F.getName() == "calloc")
1867
return false;
1868
auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUO));
1869
if (!Malloc)
1870
return false;
1871
auto *InnerCallee = Malloc->getCalledFunction();
1872
if (!InnerCallee)
1873
return false;
1874
LibFunc Func;
1875
if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
1876
Func != LibFunc_malloc)
1877
return false;
1878
// Gracefully handle malloc with unexpected memory attributes.
1879
auto *MallocDef = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(Malloc));
1880
if (!MallocDef)
1881
return false;
1882
1883
auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
1884
// Check for a br(icmp(ptr, null), truebb, falsebb) pattern at the end
1885
// of the malloc block.
1886
auto *MallocBB = Malloc->getParent(),
1887
*MemsetBB = Memset->getParent();
1888
if (MallocBB == MemsetBB)
1889
return true;
1890
auto *Ptr = Memset->getArgOperand(0);
1891
auto *TI = MallocBB->getTerminator();
1892
ICmpInst::Predicate Pred;
1893
BasicBlock *TrueBB, *FalseBB;
1894
if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
1895
FalseBB)))
1896
return false;
1897
if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
1898
return false;
1899
return true;
1900
};
1901
1902
if (Malloc->getOperand(0) != MemSet->getLength())
1903
return false;
1904
if (!shouldCreateCalloc(Malloc, MemSet) ||
1905
!DT.dominates(Malloc, MemSet) ||
1906
!memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT))
1907
return false;
1908
IRBuilder<> IRB(Malloc);
1909
Type *SizeTTy = Malloc->getArgOperand(0)->getType();
1910
auto *Calloc = emitCalloc(ConstantInt::get(SizeTTy, 1),
1911
Malloc->getArgOperand(0), IRB, TLI);
1912
if (!Calloc)
1913
return false;
1914
1915
MemorySSAUpdater Updater(&MSSA);
1916
auto *NewAccess =
1917
Updater.createMemoryAccessAfter(cast<Instruction>(Calloc), nullptr,
1918
MallocDef);
1919
auto *NewAccessMD = cast<MemoryDef>(NewAccess);
1920
Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
1921
Malloc->replaceAllUsesWith(Calloc);
1922
deleteDeadInstruction(Malloc);
1923
return true;
1924
}
1925
1926
// Check if there is a dominating condition that implies that the value
// being stored to a pointer is already present at that location.
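// Illustrative pattern (hypothetical IR): the store below is a no-op because
// the dominating branch already established that %p holds %v:
//   %old = load i32, ptr %p
//   %cmp = icmp eq i32 %old, %v
//   br i1 %cmp, label %then, label %else
// then:
//   store i32 %v, ptr %p
// assuming nothing clobbers %p between the load and the store.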
1928
bool dominatingConditionImpliesValue(MemoryDef *Def) {
1929
auto *StoreI = cast<StoreInst>(Def->getMemoryInst());
1930
BasicBlock *StoreBB = StoreI->getParent();
1931
Value *StorePtr = StoreI->getPointerOperand();
1932
Value *StoreVal = StoreI->getValueOperand();
1933
1934
DomTreeNode *IDom = DT.getNode(StoreBB)->getIDom();
1935
if (!IDom)
1936
return false;
1937
1938
auto *BI = dyn_cast<BranchInst>(IDom->getBlock()->getTerminator());
1939
if (!BI || !BI->isConditional())
1940
return false;
1941
1942
// In case both successors are the same block, it is not possible to
// determine if the optimization is possible. (We would not want to optimize
// a store in the FalseBB if the condition is true and vice versa.)
1945
if (BI->getSuccessor(0) == BI->getSuccessor(1))
1946
return false;
1947
1948
Instruction *ICmpL;
1949
ICmpInst::Predicate Pred;
1950
if (!match(BI->getCondition(),
1951
m_c_ICmp(Pred,
1952
m_CombineAnd(m_Load(m_Specific(StorePtr)),
1953
m_Instruction(ICmpL)),
1954
m_Specific(StoreVal))) ||
1955
!ICmpInst::isEquality(Pred))
1956
return false;
1957
1958
// In case the else block also branches to the if block, or the other way
// around, it is not possible to determine if the optimization is possible.
1960
if (Pred == ICmpInst::ICMP_EQ &&
1961
!DT.dominates(BasicBlockEdge(BI->getParent(), BI->getSuccessor(0)),
1962
StoreBB))
1963
return false;
1964
1965
if (Pred == ICmpInst::ICMP_NE &&
1966
!DT.dominates(BasicBlockEdge(BI->getParent(), BI->getSuccessor(1)),
1967
StoreBB))
1968
return false;
1969
1970
MemoryAccess *LoadAcc = MSSA.getMemoryAccess(ICmpL);
1971
MemoryAccess *ClobAcc =
1972
MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def, BatchAA);
1973
1974
return MSSA.dominates(ClobAcc, LoadAcc);
1975
}
1976
1977
/// \returns true if \p Def is a no-op store, either because it
1978
/// directly stores back a loaded value or stores zero to a calloced object.
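/// Illustrative no-op stores (hypothetical IR):
///   %v = load i32, ptr %p
///   store i32 %v, ptr %p            ; stores back the value just loaded
/// or, for a calloc'ed object %c:
///   store i32 0, ptr %c             ; stores the known initial value
/// (in both cases only when nothing has modified the location in between).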
1979
bool storeIsNoop(MemoryDef *Def, const Value *DefUO) {
1980
Instruction *DefI = Def->getMemoryInst();
1981
StoreInst *Store = dyn_cast<StoreInst>(DefI);
1982
MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI);
1983
Constant *StoredConstant = nullptr;
1984
if (Store)
1985
StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
1986
else if (MemSet)
1987
StoredConstant = dyn_cast<Constant>(MemSet->getValue());
1988
else
1989
return false;
1990
1991
if (!isRemovable(DefI))
1992
return false;
1993
1994
if (StoredConstant) {
1995
Constant *InitC =
1996
getInitialValueOfAllocation(DefUO, &TLI, StoredConstant->getType());
1997
// If the clobbering access is LiveOnEntry, no instructions between them
1998
// can modify the memory location.
1999
if (InitC && InitC == StoredConstant)
2000
return MSSA.isLiveOnEntryDef(
2001
MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def, BatchAA));
2002
}
2003
2004
if (!Store)
2005
return false;
2006
2007
if (dominatingConditionImpliesValue(Def))
2008
return true;
2009
2010
if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
2011
if (LoadI->getPointerOperand() == Store->getOperand(1)) {
2012
// Get the defining access for the load.
2013
auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
2014
// Fast path: the defining accesses are the same.
2015
if (LoadAccess == Def->getDefiningAccess())
2016
return true;
2017
2018
// Look through phi accesses. Recursively scan all phi accesses by
2019
// adding them to a worklist. Bail when we run into a memory def that
2020
// does not match LoadAccess.
2021
SetVector<MemoryAccess *> ToCheck;
2022
MemoryAccess *Current =
2023
MSSA.getWalker()->getClobberingMemoryAccess(Def, BatchAA);
2024
// We don't want to bail when we run into the store memory def. But,
2025
// the phi access may point to it. So, pretend like we've already
2026
// checked it.
2027
ToCheck.insert(Def);
2028
ToCheck.insert(Current);
2029
// Start at current (1) to simulate already having checked Def.
2030
for (unsigned I = 1; I < ToCheck.size(); ++I) {
2031
Current = ToCheck[I];
2032
if (auto PhiAccess = dyn_cast<MemoryPhi>(Current)) {
2033
// Check all the operands.
2034
for (auto &Use : PhiAccess->incoming_values())
2035
ToCheck.insert(cast<MemoryAccess>(&Use));
2036
continue;
2037
}
2038
2039
// If we found a memory def, bail. This happens when we have an
2040
// unrelated write in between an otherwise noop store.
2041
assert(isa<MemoryDef>(Current) &&
2042
"Only MemoryDefs should reach here.");
2043
// TODO: Skip no alias MemoryDefs that have no aliasing reads.
2044
// We are searching for the definition of the store's destination.
2045
// So, if that is the same definition as the load, then this is a
2046
// noop. Otherwise, fail.
2047
if (LoadAccess != Current)
2048
return false;
2049
}
2050
return true;
2051
}
2052
}
2053
2054
return false;
2055
}
2056
2057
bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) {
2058
bool Changed = false;
2059
for (auto OI : IOL) {
2060
Instruction *DeadI = OI.first;
2061
MemoryLocation Loc = *getLocForWrite(DeadI);
2062
assert(isRemovable(DeadI) && "Expect only removable instruction");
2063
2064
const Value *Ptr = Loc.Ptr->stripPointerCasts();
2065
int64_t DeadStart = 0;
2066
uint64_t DeadSize = Loc.Size.getValue();
2067
GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
2068
OverlapIntervalsTy &IntervalMap = OI.second;
2069
Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
2070
if (IntervalMap.empty())
2071
continue;
2072
Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
2073
}
2074
return Changed;
2075
}
2076
2077
/// Eliminates writes to locations where the value that is being written
2078
/// is already stored at the same location.
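/// Illustrative redundant store (hypothetical IR): the store repeats a byte
/// pattern already written by the dominating memset and can be removed:
///   call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 16, i1 false)
///   store i32 0, ptr %p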
2079
bool eliminateRedundantStoresOfExistingValues() {
2080
bool MadeChange = false;
2081
LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the "
2082
"already existing value\n");
2083
for (auto *Def : MemDefs) {
2084
if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def))
2085
continue;
2086
2087
Instruction *DefInst = Def->getMemoryInst();
2088
auto MaybeDefLoc = getLocForWrite(DefInst);
2089
if (!MaybeDefLoc || !isRemovable(DefInst))
2090
continue;
2091
2092
MemoryDef *UpperDef;
2093
// To conserve compile-time, we avoid walking to the next clobbering def.
2094
// Instead, we just try to get the optimized access, if it exists. DSE
2095
// will try to optimize defs during the earlier traversal.
2096
if (Def->isOptimized())
2097
UpperDef = dyn_cast<MemoryDef>(Def->getOptimized());
2098
else
2099
UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess());
2100
if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef))
2101
continue;
2102
2103
Instruction *UpperInst = UpperDef->getMemoryInst();
2104
auto IsRedundantStore = [&]() {
2105
if (DefInst->isIdenticalTo(UpperInst))
2106
return true;
2107
if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) {
2108
if (auto *SI = dyn_cast<StoreInst>(DefInst)) {
2109
// MemSetInst must have a write location.
2110
auto UpperLoc = getLocForWrite(UpperInst);
2111
if (!UpperLoc)
2112
return false;
2113
int64_t InstWriteOffset = 0;
2114
int64_t DepWriteOffset = 0;
2115
auto OR = isOverwrite(UpperInst, DefInst, *UpperLoc, *MaybeDefLoc,
2116
InstWriteOffset, DepWriteOffset);
2117
Value *StoredByte = isBytewiseValue(SI->getValueOperand(), DL);
2118
return StoredByte && StoredByte == MemSetI->getOperand(1) &&
2119
OR == OW_Complete;
2120
}
2121
}
2122
return false;
2123
};
2124
2125
if (!IsRedundantStore() || isReadClobber(*MaybeDefLoc, DefInst))
2126
continue;
2127
LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst
2128
<< '\n');
2129
deleteDeadInstruction(DefInst);
2130
NumRedundantStores++;
2131
MadeChange = true;
2132
}
2133
return MadeChange;
2134
}
2135
};
2136
2137
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
2138
DominatorTree &DT, PostDominatorTree &PDT,
2139
const TargetLibraryInfo &TLI,
2140
const LoopInfo &LI) {
2141
bool MadeChange = false;
2142
2143
DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
2144
// For each store:
2145
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
2146
MemoryDef *KillingDef = State.MemDefs[I];
2147
if (State.SkipStores.count(KillingDef))
2148
continue;
2149
Instruction *KillingI = KillingDef->getMemoryInst();
2150
2151
std::optional<MemoryLocation> MaybeKillingLoc;
2152
if (State.isMemTerminatorInst(KillingI)) {
2153
if (auto KillingLoc = State.getLocForTerminator(KillingI))
2154
MaybeKillingLoc = KillingLoc->first;
2155
} else {
2156
MaybeKillingLoc = State.getLocForWrite(KillingI);
2157
}
2158
2159
if (!MaybeKillingLoc) {
2160
LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
2161
<< *KillingI << "\n");
2162
continue;
2163
}
2164
MemoryLocation KillingLoc = *MaybeKillingLoc;
2165
assert(KillingLoc.Ptr && "KillingLoc should not be null");
2166
const Value *KillingUndObj = getUnderlyingObject(KillingLoc.Ptr);
2167
LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
2168
<< *KillingDef << " (" << *KillingI << ")\n");
2169
2170
unsigned ScanLimit = MemorySSAScanLimit;
2171
unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
2172
unsigned PartialLimit = MemorySSAPartialStoreLimit;
2173
// Worklist of MemoryAccesses that may be killed by KillingDef.
2174
SmallSetVector<MemoryAccess *, 8> ToCheck;
2175
// Track MemoryAccesses that have been deleted in the loop below, so we can
2176
// skip them. Don't use SkipStores for this, which may contain reused
2177
// MemoryAccess addresses.
2178
SmallPtrSet<MemoryAccess *, 8> Deleted;
2179
[[maybe_unused]] unsigned OrigNumSkipStores = State.SkipStores.size();
2180
ToCheck.insert(KillingDef->getDefiningAccess());
2181
2182
bool Shortend = false;
2183
bool IsMemTerm = State.isMemTerminatorInst(KillingI);
2184
// Check if MemoryAccesses in the worklist are killed by KillingDef.
2185
for (unsigned I = 0; I < ToCheck.size(); I++) {
2186
MemoryAccess *Current = ToCheck[I];
2187
if (Deleted.contains(Current))
2188
continue;
2189
2190
std::optional<MemoryAccess *> MaybeDeadAccess = State.getDomMemoryDef(
2191
KillingDef, Current, KillingLoc, KillingUndObj, ScanLimit,
2192
WalkerStepLimit, IsMemTerm, PartialLimit);
2193
2194
if (!MaybeDeadAccess) {
2195
LLVM_DEBUG(dbgs() << " finished walk\n");
2196
continue;
2197
}
2198
2199
MemoryAccess *DeadAccess = *MaybeDeadAccess;
2200
LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess);
2201
if (isa<MemoryPhi>(DeadAccess)) {
2202
LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
2203
for (Value *V : cast<MemoryPhi>(DeadAccess)->incoming_values()) {
2204
MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
2205
BasicBlock *IncomingBlock = IncomingAccess->getBlock();
2206
BasicBlock *PhiBlock = DeadAccess->getBlock();
2207
2208
// We only consider incoming MemoryAccesses that come before the
2209
// MemoryPhi. Otherwise we could discover candidates that do not
2210
// strictly dominate our starting def.
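// Schematic example: for a loop with
//   header: 3 = MemoryPhi({entry, 1}, {latch, 2})
//   latch:  2 = MemoryDef(3)
// only the incoming access 1 from the entry side is queued; the back-edge
// access 2 has a smaller post-order number than the header and is skipped.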
2211
if (State.PostOrderNumbers[IncomingBlock] >
2212
State.PostOrderNumbers[PhiBlock])
2213
ToCheck.insert(IncomingAccess);
2214
}
2215
continue;
2216
}
2217
auto *DeadDefAccess = cast<MemoryDef>(DeadAccess);
2218
Instruction *DeadI = DeadDefAccess->getMemoryInst();
2219
LLVM_DEBUG(dbgs() << " (" << *DeadI << ")\n");
2220
ToCheck.insert(DeadDefAccess->getDefiningAccess());
2221
NumGetDomMemoryDefPassed++;
2222
2223
if (!DebugCounter::shouldExecute(MemorySSACounter))
2224
continue;
2225
2226
MemoryLocation DeadLoc = *State.getLocForWrite(DeadI);
2227
2228
if (IsMemTerm) {
2229
const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr);
2230
if (KillingUndObj != DeadUndObj)
2231
continue;
2232
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI
2233
<< "\n KILLER: " << *KillingI << '\n');
2234
State.deleteDeadInstruction(DeadI, &Deleted);
2235
++NumFastStores;
2236
MadeChange = true;
2237
} else {
2238
// Check if DeadI overwrites KillingI.
2239
int64_t KillingOffset = 0;
2240
int64_t DeadOffset = 0;
2241
OverwriteResult OR = State.isOverwrite(
2242
KillingI, DeadI, KillingLoc, DeadLoc, KillingOffset, DeadOffset);
2243
if (OR == OW_MaybePartial) {
2244
auto Iter = State.IOLs.insert(
2245
std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
2246
DeadI->getParent(), InstOverlapIntervalsTy()));
2247
auto &IOL = Iter.first->second;
2248
OR = isPartialOverwrite(KillingLoc, DeadLoc, KillingOffset,
2249
DeadOffset, DeadI, IOL);
2250
}
2251
2252
if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
2253
auto *DeadSI = dyn_cast<StoreInst>(DeadI);
2254
auto *KillingSI = dyn_cast<StoreInst>(KillingI);
2255
// We are re-using tryToMergePartialOverlappingStores, which requires
2256
// DeadSI to dominate KillingSI.
2257
// TODO: implement tryToMergePartialOverlappingStores using MemorySSA.
2258
if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) {
2259
if (Constant *Merged = tryToMergePartialOverlappingStores(
2260
KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL,
2261
State.BatchAA, &DT)) {
2262
2263
// Update stored value of earlier store to merged constant.
2264
DeadSI->setOperand(0, Merged);
2265
++NumModifiedStores;
2266
MadeChange = true;
2267
2268
Shortend = true;
2269
// Remove killing store and remove any outstanding overlap
2270
// intervals for the updated store.
2271
State.deleteDeadInstruction(KillingSI, &Deleted);
2272
auto I = State.IOLs.find(DeadSI->getParent());
2273
if (I != State.IOLs.end())
2274
I->second.erase(DeadSI);
2275
break;
2276
}
2277
}
2278
}
2279
2280
if (OR == OW_Complete) {
2281
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI
2282
<< "\n KILLER: " << *KillingI << '\n');
2283
State.deleteDeadInstruction(DeadI, &Deleted);
2284
++NumFastStores;
2285
MadeChange = true;
2286
}
2287
}
2288
}
2289
2290
assert(State.SkipStores.size() - OrigNumSkipStores == Deleted.size() &&
2291
"SkipStores and Deleted out of sync?");
2292
2293
// Check if the store is a no-op.
2294
if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) {
2295
LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI
2296
<< '\n');
2297
State.deleteDeadInstruction(KillingI);
2298
NumRedundantStores++;
2299
MadeChange = true;
2300
continue;
2301
}
2302
2303
// Can we form a calloc from a memset/malloc pair?
2304
if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) {
2305
LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n"
2306
<< " DEAD: " << *KillingI << '\n');
2307
State.deleteDeadInstruction(KillingI);
2308
MadeChange = true;
2309
continue;
2310
}
2311
}
2312
2313
if (EnablePartialOverwriteTracking)
2314
for (auto &KV : State.IOLs)
2315
MadeChange |= State.removePartiallyOverlappedStores(KV.second);
2316
2317
MadeChange |= State.eliminateRedundantStoresOfExistingValues();
2318
MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
2319
2320
while (!State.ToRemove.empty()) {
2321
Instruction *DeadInst = State.ToRemove.pop_back_val();
2322
DeadInst->eraseFromParent();
2323
}
2324
2325
return MadeChange;
2326
}
2327
} // end anonymous namespace
2328
2329
//===----------------------------------------------------------------------===//
2330
// DSE Pass
2331
//===----------------------------------------------------------------------===//
2332
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
2333
AliasAnalysis &AA = AM.getResult<AAManager>(F);
2334
const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
2335
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
2336
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
2337
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
2338
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
2339
2340
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
2341
2342
#ifdef LLVM_ENABLE_STATS
2343
if (AreStatisticsEnabled())
2344
for (auto &I : instructions(F))
2345
NumRemainingStores += isa<StoreInst>(&I);
2346
#endif
2347
2348
if (!Changed)
2349
return PreservedAnalyses::all();
2350
2351
PreservedAnalyses PA;
2352
PA.preserveSet<CFGAnalyses>();
2353
PA.preserve<MemorySSAAnalysis>();
2354
PA.preserve<LoopAnalysis>();
2355
return PA;
2356
}