Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
35266 views
//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// OpenMP specific optimizations:9//10// - Deduplication of runtime calls, e.g., omp_get_thread_num.11// - Replacing globalized device memory with stack memory.12// - Replacing globalized device memory with shared memory.13// - Parallel region merging.14// - Transforming generic-mode device kernels to SPMD mode.15// - Specializing the state machine for generic-mode device kernels.16//17//===----------------------------------------------------------------------===//1819#include "llvm/Transforms/IPO/OpenMPOpt.h"2021#include "llvm/ADT/EnumeratedArray.h"22#include "llvm/ADT/PostOrderIterator.h"23#include "llvm/ADT/SetVector.h"24#include "llvm/ADT/SmallPtrSet.h"25#include "llvm/ADT/SmallVector.h"26#include "llvm/ADT/Statistic.h"27#include "llvm/ADT/StringExtras.h"28#include "llvm/ADT/StringRef.h"29#include "llvm/Analysis/CallGraph.h"30#include "llvm/Analysis/CallGraphSCCPass.h"31#include "llvm/Analysis/MemoryLocation.h"32#include "llvm/Analysis/OptimizationRemarkEmitter.h"33#include "llvm/Analysis/ValueTracking.h"34#include "llvm/Frontend/OpenMP/OMPConstants.h"35#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"36#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"37#include "llvm/IR/Assumptions.h"38#include "llvm/IR/BasicBlock.h"39#include "llvm/IR/Constants.h"40#include "llvm/IR/DiagnosticInfo.h"41#include "llvm/IR/Dominators.h"42#include "llvm/IR/Function.h"43#include "llvm/IR/GlobalValue.h"44#include "llvm/IR/GlobalVariable.h"45#include "llvm/IR/InstrTypes.h"46#include "llvm/IR/Instruction.h"47#include "llvm/IR/Instructions.h"48#include "llvm/IR/IntrinsicInst.h"49#include "llvm/IR/IntrinsicsAMDGPU.h"50#include "llvm/IR/IntrinsicsNVPTX.h"51#include "llvm/IR/LLVMContext.h"52#include "llvm/Support/Casting.h"53#include "llvm/Support/CommandLine.h"54#include "llvm/Support/Debug.h"55#include "llvm/Transforms/IPO/Attributor.h"56#include "llvm/Transforms/Utils/BasicBlockUtils.h"57#include "llvm/Transforms/Utils/CallGraphUpdater.h"5859#include <algorithm>60#include <optional>61#include <string>6263using namespace llvm;64using namespace omp;6566#define DEBUG_TYPE "openmp-opt"6768static cl::opt<bool> DisableOpenMPOptimizations(69"openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),70cl::Hidden, cl::init(false));7172static cl::opt<bool> EnableParallelRegionMerging(73"openmp-opt-enable-merging",74cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,75cl::init(false));7677static cl::opt<bool>78DisableInternalization("openmp-opt-disable-internalization",79cl::desc("Disable function internalization."),80cl::Hidden, cl::init(false));8182static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values",83cl::init(false), cl::Hidden);84static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),85cl::Hidden);86static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",87cl::init(false), cl::Hidden);8889static cl::opt<bool> HideMemoryTransferLatency(90"openmp-hide-memory-transfer-latency",91cl::desc("[WIP] Tries to hide the latency of host to device memory"92" transfers"),93cl::Hidden, cl::init(false));9495static cl::opt<bool> 
DisableOpenMPOptDeglobalization(96"openmp-opt-disable-deglobalization",97cl::desc("Disable OpenMP optimizations involving deglobalization."),98cl::Hidden, cl::init(false));99100static cl::opt<bool> DisableOpenMPOptSPMDization(101"openmp-opt-disable-spmdization",102cl::desc("Disable OpenMP optimizations involving SPMD-ization."),103cl::Hidden, cl::init(false));104105static cl::opt<bool> DisableOpenMPOptFolding(106"openmp-opt-disable-folding",107cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,108cl::init(false));109110static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(111"openmp-opt-disable-state-machine-rewrite",112cl::desc("Disable OpenMP optimizations that replace the state machine."),113cl::Hidden, cl::init(false));114115static cl::opt<bool> DisableOpenMPOptBarrierElimination(116"openmp-opt-disable-barrier-elimination",117cl::desc("Disable OpenMP optimizations that eliminate barriers."),118cl::Hidden, cl::init(false));119120static cl::opt<bool> PrintModuleAfterOptimizations(121"openmp-opt-print-module-after",122cl::desc("Print the current module after OpenMP optimizations."),123cl::Hidden, cl::init(false));124125static cl::opt<bool> PrintModuleBeforeOptimizations(126"openmp-opt-print-module-before",127cl::desc("Print the current module before OpenMP optimizations."),128cl::Hidden, cl::init(false));129130static cl::opt<bool> AlwaysInlineDeviceFunctions(131"openmp-opt-inline-device",132cl::desc("Inline all applicible functions on the device."), cl::Hidden,133cl::init(false));134135static cl::opt<bool>136EnableVerboseRemarks("openmp-opt-verbose-remarks",137cl::desc("Enables more verbose remarks."), cl::Hidden,138cl::init(false));139140static cl::opt<unsigned>141SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,142cl::desc("Maximal number of attributor iterations."),143cl::init(256));144145static cl::opt<unsigned>146SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,147cl::desc("Maximum amount of shared memory to use."),148cl::init(std::numeric_limits<unsigned>::max()));149150STATISTIC(NumOpenMPRuntimeCallsDeduplicated,151"Number of OpenMP runtime calls deduplicated");152STATISTIC(NumOpenMPParallelRegionsDeleted,153"Number of OpenMP parallel regions deleted");154STATISTIC(NumOpenMPRuntimeFunctionsIdentified,155"Number of OpenMP runtime functions identified");156STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,157"Number of OpenMP runtime function uses identified");158STATISTIC(NumOpenMPTargetRegionKernels,159"Number of OpenMP target region entry points (=kernels) identified");160STATISTIC(NumNonOpenMPTargetRegionKernels,161"Number of non-OpenMP target region kernels identified");162STATISTIC(NumOpenMPTargetRegionKernelsSPMD,163"Number of OpenMP target region entry points (=kernels) executed in "164"SPMD-mode instead of generic-mode");165STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,166"Number of OpenMP target region entry points (=kernels) executed in "167"generic-mode without a state machines");168STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,169"Number of OpenMP target region entry points (=kernels) executed in "170"generic-mode with customized state machines with fallback");171STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,172"Number of OpenMP target region entry points (=kernels) executed in "173"generic-mode with customized state machines without fallback");174STATISTIC(175NumOpenMPParallelRegionsReplacedInGPUStateMachine,176"Number of OpenMP parallel regions replaced with ID in 
GPU state machines");177STATISTIC(NumOpenMPParallelRegionsMerged,178"Number of OpenMP parallel regions merged");179STATISTIC(NumBytesMovedToSharedMemory,180"Amount of memory pushed to shared memory");181STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");182183#if !defined(NDEBUG)184static constexpr auto TAG = "[" DEBUG_TYPE "]";185#endif186187namespace KernelInfo {188189// struct ConfigurationEnvironmentTy {190// uint8_t UseGenericStateMachine;191// uint8_t MayUseNestedParallelism;192// llvm::omp::OMPTgtExecModeFlags ExecMode;193// int32_t MinThreads;194// int32_t MaxThreads;195// int32_t MinTeams;196// int32_t MaxTeams;197// };198199// struct DynamicEnvironmentTy {200// uint16_t DebugIndentionLevel;201// };202203// struct KernelEnvironmentTy {204// ConfigurationEnvironmentTy Configuration;205// IdentTy *Ident;206// DynamicEnvironmentTy *DynamicEnv;207// };208209#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \210constexpr const unsigned MEMBER##Idx = IDX;211212KERNEL_ENVIRONMENT_IDX(Configuration, 0)213KERNEL_ENVIRONMENT_IDX(Ident, 1)214215#undef KERNEL_ENVIRONMENT_IDX216217#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \218constexpr const unsigned MEMBER##Idx = IDX;219220KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)221KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)222KERNEL_ENVIRONMENT_CONFIGURATION_IDX(ExecMode, 2)223KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinThreads, 3)224KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxThreads, 4)225KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinTeams, 5)226KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxTeams, 6)227228#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX229230#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \231RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \232return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \233}234235KERNEL_ENVIRONMENT_GETTER(Ident, Constant)236KERNEL_ENVIRONMENT_GETTER(Configuration, ConstantStruct)237238#undef KERNEL_ENVIRONMENT_GETTER239240#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \241ConstantInt *get##MEMBER##FromKernelEnvironment( \242ConstantStruct *KernelEnvC) { \243ConstantStruct *ConfigC = \244getConfigurationFromKernelEnvironment(KernelEnvC); \245return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \246}247248KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(UseGenericStateMachine)249KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MayUseNestedParallelism)250KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(ExecMode)251KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinThreads)252KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxThreads)253KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinTeams)254KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)255256#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER257258GlobalVariable *259getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {260constexpr const int InitKernelEnvironmentArgNo = 0;261return cast<GlobalVariable>(262KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)263->stripPointerCasts());264}265266ConstantStruct *getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB) {267GlobalVariable *KernelEnvGV =268getKernelEnvironementGVFromKernelInitCB(KernelInitCB);269return cast<ConstantStruct>(KernelEnvGV->getInitializer());270}271} // namespace KernelInfo272273namespace {274275struct AAHeapToShared;276277struct AAICVTracker;278279/// OpenMP specific information. 
For now, stores RFIs and ICVs also needed for280/// Attributor runs.281struct OMPInformationCache : public InformationCache {282OMPInformationCache(Module &M, AnalysisGetter &AG,283BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,284bool OpenMPPostLink)285: InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),286OpenMPPostLink(OpenMPPostLink) {287288OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M);289OMPBuilder.initialize();290initializeRuntimeFunctions(M);291initializeInternalControlVars();292}293294/// Generic information that describes an internal control variable.295struct InternalControlVarInfo {296/// The kind, as described by InternalControlVar enum.297InternalControlVar Kind;298299/// The name of the ICV.300StringRef Name;301302/// Environment variable associated with this ICV.303StringRef EnvVarName;304305/// Initial value kind.306ICVInitValue InitKind;307308/// Initial value.309ConstantInt *InitValue;310311/// Setter RTL function associated with this ICV.312RuntimeFunction Setter;313314/// Getter RTL function associated with this ICV.315RuntimeFunction Getter;316317/// RTL Function corresponding to the override clause of this ICV318RuntimeFunction Clause;319};320321/// Generic information that describes a runtime function322struct RuntimeFunctionInfo {323324/// The kind, as described by the RuntimeFunction enum.325RuntimeFunction Kind;326327/// The name of the function.328StringRef Name;329330/// Flag to indicate a variadic function.331bool IsVarArg;332333/// The return type of the function.334Type *ReturnType;335336/// The argument types of the function.337SmallVector<Type *, 8> ArgumentTypes;338339/// The declaration if available.340Function *Declaration = nullptr;341342/// Uses of this runtime function per function containing the use.343using UseVector = SmallVector<Use *, 16>;344345/// Clear UsesMap for runtime function.346void clearUsesMap() { UsesMap.clear(); }347348/// Boolean conversion that is true if the runtime function was found.349operator bool() const { return Declaration; }350351/// Return the vector of uses in function \p F.352UseVector &getOrCreateUseVector(Function *F) {353std::shared_ptr<UseVector> &UV = UsesMap[F];354if (!UV)355UV = std::make_shared<UseVector>();356return *UV;357}358359/// Return the vector of uses in function \p F or `nullptr` if there are360/// none.361const UseVector *getUseVector(Function &F) const {362auto I = UsesMap.find(&F);363if (I != UsesMap.end())364return I->second.get();365return nullptr;366}367368/// Return how many functions contain uses of this runtime function.369size_t getNumFunctionsWithUses() const { return UsesMap.size(); }370371/// Return the number of arguments (or the minimal number for variadic372/// functions).373size_t getNumArgs() const { return ArgumentTypes.size(); }374375/// Run the callback \p CB on each use and forget the use if the result is376/// true. 
The callback will be fed the function in which the use was377/// encountered as second argument.378void foreachUse(SmallVectorImpl<Function *> &SCC,379function_ref<bool(Use &, Function &)> CB) {380for (Function *F : SCC)381foreachUse(CB, F);382}383384/// Run the callback \p CB on each use within the function \p F and forget385/// the use if the result is true.386void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {387SmallVector<unsigned, 8> ToBeDeleted;388ToBeDeleted.clear();389390unsigned Idx = 0;391UseVector &UV = getOrCreateUseVector(F);392393for (Use *U : UV) {394if (CB(*U, *F))395ToBeDeleted.push_back(Idx);396++Idx;397}398399// Remove the to-be-deleted indices in reverse order as prior400// modifications will not modify the smaller indices.401while (!ToBeDeleted.empty()) {402unsigned Idx = ToBeDeleted.pop_back_val();403UV[Idx] = UV.back();404UV.pop_back();405}406}407408private:409/// Map from functions to all uses of this runtime function contained in410/// them.411DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;412413public:414/// Iterators for the uses of this runtime function.415decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }416decltype(UsesMap)::iterator end() { return UsesMap.end(); }417};418419/// An OpenMP-IR-Builder instance420OpenMPIRBuilder OMPBuilder;421422/// Map from runtime function kind to the runtime function description.423EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,424RuntimeFunction::OMPRTL___last>425RFIs;426427/// Map from function declarations/definitions to their runtime enum type.428DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;429430/// Map from ICV kind to the ICV description.431EnumeratedArray<InternalControlVarInfo, InternalControlVar,432InternalControlVar::ICV___last>433ICVs;434435/// Helper to initialize all internal control variable information for those436/// defined in OMPKinds.def.437void initializeInternalControlVars() {438#define ICV_RT_SET(_Name, RTL) \439{ \440auto &ICV = ICVs[_Name]; \441ICV.Setter = RTL; \442}443#define ICV_RT_GET(Name, RTL) \444{ \445auto &ICV = ICVs[Name]; \446ICV.Getter = RTL; \447}448#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \449{ \450auto &ICV = ICVs[Enum]; \451ICV.Name = _Name; \452ICV.Kind = Enum; \453ICV.InitKind = Init; \454ICV.EnvVarName = _EnvVarName; \455switch (ICV.InitKind) { \456case ICV_IMPLEMENTATION_DEFINED: \457ICV.InitValue = nullptr; \458break; \459case ICV_ZERO: \460ICV.InitValue = ConstantInt::get( \461Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \462break; \463case ICV_FALSE: \464ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \465break; \466case ICV_LAST: \467break; \468} \469}470#include "llvm/Frontend/OpenMP/OMPKinds.def"471}472473/// Returns true if the function declaration \p F matches the runtime474/// function types, that is, return type \p RTFRetType, and argument types475/// \p RTFArgTypes.476static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,477SmallVector<Type *, 8> &RTFArgTypes) {478// TODO: We should output information to the user (under debug output479// and via remarks).480481if (!F)482return false;483if (F->getReturnType() != RTFRetType)484return false;485if (F->arg_size() != RTFArgTypes.size())486return false;487488auto *RTFTyIt = RTFArgTypes.begin();489for (Argument &Arg : F->args()) {490if (Arg.getType() != *RTFTyIt)491return false;492493++RTFTyIt;494}495496return true;497}498499// Helper to collect all uses of the declaration in the UsesMap.500unsigned 
collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {501unsigned NumUses = 0;502if (!RFI.Declaration)503return NumUses;504OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);505506if (CollectStats) {507NumOpenMPRuntimeFunctionsIdentified += 1;508NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();509}510511// TODO: We directly convert uses into proper calls and unknown uses.512for (Use &U : RFI.Declaration->uses()) {513if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {514if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {515RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);516++NumUses;517}518} else {519RFI.getOrCreateUseVector(nullptr).push_back(&U);520++NumUses;521}522}523return NumUses;524}525526// Helper function to recollect uses of a runtime function.527void recollectUsesForFunction(RuntimeFunction RTF) {528auto &RFI = RFIs[RTF];529RFI.clearUsesMap();530collectUses(RFI, /*CollectStats*/ false);531}532533// Helper function to recollect uses of all runtime functions.534void recollectUses() {535for (int Idx = 0; Idx < RFIs.size(); ++Idx)536recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));537}538539// Helper function to inherit the calling convention of the function callee.540void setCallingConvention(FunctionCallee Callee, CallInst *CI) {541if (Function *Fn = dyn_cast<Function>(Callee.getCallee()))542CI->setCallingConv(Fn->getCallingConv());543}544545// Helper function to determine if it's legal to create a call to the runtime546// functions.547bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) {548// We can always emit calls if we haven't yet linked in the runtime.549if (!OpenMPPostLink)550return true;551552// Once the runtime has been already been linked in we cannot emit calls to553// any undefined functions.554for (RuntimeFunction Fn : Fns) {555RuntimeFunctionInfo &RFI = RFIs[Fn];556557if (RFI.Declaration && RFI.Declaration->isDeclaration())558return false;559}560return true;561}562563/// Helper to initialize all runtime function information for those defined564/// in OpenMPKinds.def.565void initializeRuntimeFunctions(Module &M) {566567// Helper macros for handling __VA_ARGS__ in OMP_RTL568#define OMP_TYPE(VarName, ...) \569Type *VarName = OMPBuilder.VarName; \570(void)VarName;571572#define OMP_ARRAY_TYPE(VarName, ...) \573ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \574(void)VarName##Ty; \575PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \576(void)VarName##PtrTy;577578#define OMP_FUNCTION_TYPE(VarName, ...) \579FunctionType *VarName = OMPBuilder.VarName; \580(void)VarName; \581PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \582(void)VarName##Ptr;583584#define OMP_STRUCT_TYPE(VarName, ...) \585StructType *VarName = OMPBuilder.VarName; \586(void)VarName; \587PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \588(void)VarName##Ptr;589590#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \591{ \592SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \593Function *F = M.getFunction(_Name); \594RTLFunctions.insert(F); \595if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \596RuntimeFunctionIDMap[F] = _Enum; \597auto &RFI = RFIs[_Enum]; \598RFI.Kind = _Enum; \599RFI.Name = _Name; \600RFI.IsVarArg = _IsVarArg; \601RFI.ReturnType = OMPBuilder._ReturnType; \602RFI.ArgumentTypes = std::move(ArgsTypes); \603RFI.Declaration = F; \604unsigned NumUses = collectUses(RFI); \605(void)NumUses; \606LLVM_DEBUG({ \607dbgs() << TAG << RFI.Name << (RFI.Declaration ? 
"" : " not") \608<< " found\n"; \609if (RFI.Declaration) \610dbgs() << TAG << "-> got " << NumUses << " uses in " \611<< RFI.getNumFunctionsWithUses() \612<< " different functions.\n"; \613}); \614} \615}616#include "llvm/Frontend/OpenMP/OMPKinds.def"617618// Remove the `noinline` attribute from `__kmpc`, `ompx::` and `omp_`619// functions, except if `optnone` is present.620if (isOpenMPDevice(M)) {621for (Function &F : M) {622for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})623if (F.hasFnAttribute(Attribute::NoInline) &&624F.getName().starts_with(Prefix) &&625!F.hasFnAttribute(Attribute::OptimizeNone))626F.removeFnAttr(Attribute::NoInline);627}628}629630// TODO: We should attach the attributes defined in OMPKinds.def.631}632633/// Collection of known OpenMP runtime functions..634DenseSet<const Function *> RTLFunctions;635636/// Indicates if we have already linked in the OpenMP device library.637bool OpenMPPostLink = false;638};639640template <typename Ty, bool InsertInvalidates = true>641struct BooleanStateWithSetVector : public BooleanState {642bool contains(const Ty &Elem) const { return Set.contains(Elem); }643bool insert(const Ty &Elem) {644if (InsertInvalidates)645BooleanState::indicatePessimisticFixpoint();646return Set.insert(Elem);647}648649const Ty &operator[](int Idx) const { return Set[Idx]; }650bool operator==(const BooleanStateWithSetVector &RHS) const {651return BooleanState::operator==(RHS) && Set == RHS.Set;652}653bool operator!=(const BooleanStateWithSetVector &RHS) const {654return !(*this == RHS);655}656657bool empty() const { return Set.empty(); }658size_t size() const { return Set.size(); }659660/// "Clamp" this state with \p RHS.661BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {662BooleanState::operator^=(RHS);663Set.insert(RHS.Set.begin(), RHS.Set.end());664return *this;665}666667private:668/// A set to keep track of elements.669SetVector<Ty> Set;670671public:672typename decltype(Set)::iterator begin() { return Set.begin(); }673typename decltype(Set)::iterator end() { return Set.end(); }674typename decltype(Set)::const_iterator begin() const { return Set.begin(); }675typename decltype(Set)::const_iterator end() const { return Set.end(); }676};677678template <typename Ty, bool InsertInvalidates = true>679using BooleanStateWithPtrSetVector =680BooleanStateWithSetVector<Ty *, InsertInvalidates>;681682struct KernelInfoState : AbstractState {683/// Flag to track if we reached a fixpoint.684bool IsAtFixpoint = false;685686/// The parallel regions (identified by the outlined parallel functions) that687/// can be reached from the associated function.688BooleanStateWithPtrSetVector<CallBase, /* InsertInvalidates */ false>689ReachedKnownParallelRegions;690691/// State to track what parallel region we might reach.692BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;693694/// State to track if we are in SPMD-mode, assumed or know, and why we decided695/// we cannot be. If it is assumed, then RequiresFullRuntime should also be696/// false.697BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;698699/// The __kmpc_target_init call in this kernel, if any. If we find more than700/// one we abort as the kernel is malformed.701CallBase *KernelInitCB = nullptr;702703/// The constant kernel environement as taken from and passed to704/// __kmpc_target_init.705ConstantStruct *KernelEnvC = nullptr;706707/// The __kmpc_target_deinit call in this kernel, if any. 
If we find more than708/// one we abort as the kernel is malformed.709CallBase *KernelDeinitCB = nullptr;710711/// Flag to indicate if the associated function is a kernel entry.712bool IsKernelEntry = false;713714/// State to track what kernel entries can reach the associated function.715BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;716717/// State to indicate if we can track parallel level of the associated718/// function. We will give up tracking if we encounter unknown caller or the719/// caller is __kmpc_parallel_51.720BooleanStateWithSetVector<uint8_t> ParallelLevels;721722/// Flag that indicates if the kernel has nested Parallelism723bool NestedParallelism = false;724725/// Abstract State interface726///{727728KernelInfoState() = default;729KernelInfoState(bool BestState) {730if (!BestState)731indicatePessimisticFixpoint();732}733734/// See AbstractState::isValidState(...)735bool isValidState() const override { return true; }736737/// See AbstractState::isAtFixpoint(...)738bool isAtFixpoint() const override { return IsAtFixpoint; }739740/// See AbstractState::indicatePessimisticFixpoint(...)741ChangeStatus indicatePessimisticFixpoint() override {742IsAtFixpoint = true;743ParallelLevels.indicatePessimisticFixpoint();744ReachingKernelEntries.indicatePessimisticFixpoint();745SPMDCompatibilityTracker.indicatePessimisticFixpoint();746ReachedKnownParallelRegions.indicatePessimisticFixpoint();747ReachedUnknownParallelRegions.indicatePessimisticFixpoint();748NestedParallelism = true;749return ChangeStatus::CHANGED;750}751752/// See AbstractState::indicateOptimisticFixpoint(...)753ChangeStatus indicateOptimisticFixpoint() override {754IsAtFixpoint = true;755ParallelLevels.indicateOptimisticFixpoint();756ReachingKernelEntries.indicateOptimisticFixpoint();757SPMDCompatibilityTracker.indicateOptimisticFixpoint();758ReachedKnownParallelRegions.indicateOptimisticFixpoint();759ReachedUnknownParallelRegions.indicateOptimisticFixpoint();760return ChangeStatus::UNCHANGED;761}762763/// Return the assumed state764KernelInfoState &getAssumed() { return *this; }765const KernelInfoState &getAssumed() const { return *this; }766767bool operator==(const KernelInfoState &RHS) const {768if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)769return false;770if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)771return false;772if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)773return false;774if (ReachingKernelEntries != RHS.ReachingKernelEntries)775return false;776if (ParallelLevels != RHS.ParallelLevels)777return false;778if (NestedParallelism != RHS.NestedParallelism)779return false;780return true;781}782783/// Returns true if this kernel contains any OpenMP parallel regions.784bool mayContainParallelRegion() {785return !ReachedKnownParallelRegions.empty() ||786!ReachedUnknownParallelRegions.empty();787}788789/// Return empty set as the best state of potential values.790static KernelInfoState getBestState() { return KernelInfoState(true); }791792static KernelInfoState getBestState(KernelInfoState &KIS) {793return getBestState();794}795796/// Return full set as the worst state of potential values.797static KernelInfoState getWorstState() { return KernelInfoState(false); }798799/// "Clamp" this state with \p KIS.800KernelInfoState operator^=(const KernelInfoState &KIS) {801// Do not merge two different _init and _deinit call sites.802if (KIS.KernelInitCB) {803if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)804llvm_unreachable("Kernel that 
calls another kernel violates OpenMP-Opt "805"assumptions.");806KernelInitCB = KIS.KernelInitCB;807}808if (KIS.KernelDeinitCB) {809if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)810llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "811"assumptions.");812KernelDeinitCB = KIS.KernelDeinitCB;813}814if (KIS.KernelEnvC) {815if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)816llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "817"assumptions.");818KernelEnvC = KIS.KernelEnvC;819}820SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;821ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;822ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;823NestedParallelism |= KIS.NestedParallelism;824return *this;825}826827KernelInfoState operator&=(const KernelInfoState &KIS) {828return (*this ^= KIS);829}830831///}832};833834/// Used to map the values physically (in the IR) stored in an offload835/// array, to a vector in memory.836struct OffloadArray {837/// Physical array (in the IR).838AllocaInst *Array = nullptr;839/// Mapped values.840SmallVector<Value *, 8> StoredValues;841/// Last stores made in the offload array.842SmallVector<StoreInst *, 8> LastAccesses;843844OffloadArray() = default;845846/// Initializes the OffloadArray with the values stored in \p Array before847/// instruction \p Before is reached. Returns false if the initialization848/// fails.849/// This MUST be used immediately after the construction of the object.850bool initialize(AllocaInst &Array, Instruction &Before) {851if (!Array.getAllocatedType()->isArrayTy())852return false;853854if (!getValues(Array, Before))855return false;856857this->Array = &Array;858return true;859}860861static const unsigned DeviceIDArgNum = 1;862static const unsigned BasePtrsArgNum = 3;863static const unsigned PtrsArgNum = 4;864static const unsigned SizesArgNum = 5;865866private:867/// Traverses the BasicBlock where \p Array is, collecting the stores made to868/// \p Array, leaving StoredValues with the values stored before the869/// instruction \p Before is reached.870bool getValues(AllocaInst &Array, Instruction &Before) {871// Initialize container.872const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();873StoredValues.assign(NumValues, nullptr);874LastAccesses.assign(NumValues, nullptr);875876// TODO: This assumes the instruction \p Before is in the same877// BasicBlock as Array. 
Make it general, for any control flow graph.878BasicBlock *BB = Array.getParent();879if (BB != Before.getParent())880return false;881882const DataLayout &DL = Array.getDataLayout();883const unsigned int PointerSize = DL.getPointerSize();884885for (Instruction &I : *BB) {886if (&I == &Before)887break;888889if (!isa<StoreInst>(&I))890continue;891892auto *S = cast<StoreInst>(&I);893int64_t Offset = -1;894auto *Dst =895GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);896if (Dst == &Array) {897int64_t Idx = Offset / PointerSize;898StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());899LastAccesses[Idx] = S;900}901}902903return isFilled();904}905906/// Returns true if all values in StoredValues and907/// LastAccesses are not nullptrs.908bool isFilled() {909const unsigned NumValues = StoredValues.size();910for (unsigned I = 0; I < NumValues; ++I) {911if (!StoredValues[I] || !LastAccesses[I])912return false;913}914915return true;916}917};918919struct OpenMPOpt {920921using OptimizationRemarkGetter =922function_ref<OptimizationRemarkEmitter &(Function *)>;923924OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,925OptimizationRemarkGetter OREGetter,926OMPInformationCache &OMPInfoCache, Attributor &A)927: M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),928OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}929930/// Check if any remarks are enabled for openmp-opt931bool remarksEnabled() {932auto &Ctx = M.getContext();933return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);934}935936/// Run all OpenMP optimizations on the underlying SCC.937bool run(bool IsModulePass) {938if (SCC.empty())939return false;940941bool Changed = false;942943LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()944<< " functions\n");945946if (IsModulePass) {947Changed |= runAttributor(IsModulePass);948949// Recollect uses, in case Attributor deleted any.950OMPInfoCache.recollectUses();951952// TODO: This should be folded into buildCustomStateMachine.953Changed |= rewriteDeviceCodeStateMachine();954955if (remarksEnabled())956analysisGlobalization();957} else {958if (PrintICVValues)959printICVs();960if (PrintOpenMPKernels)961printKernels();962963Changed |= runAttributor(IsModulePass);964965// Recollect uses, in case Attributor deleted any.966OMPInfoCache.recollectUses();967968Changed |= deleteParallelRegions();969970if (HideMemoryTransferLatency)971Changed |= hideMemTransfersLatency();972Changed |= deduplicateRuntimeCalls();973if (EnableParallelRegionMerging) {974if (mergeParallelRegions()) {975deduplicateRuntimeCalls();976Changed = true;977}978}979}980981if (OMPInfoCache.OpenMPPostLink)982Changed |= removeRuntimeSymbols();983984return Changed;985}986987/// Print initial ICV values for testing.988/// FIXME: This should be done from the Attributor once it is added.989void printICVs() const {990InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,991ICV_proc_bind};992993for (Function *F : SCC) {994for (auto ICV : ICVs) {995auto ICVInfo = OMPInfoCache.ICVs[ICV];996auto Remark = [&](OptimizationRemarkAnalysis ORA) {997return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)998<< " Value: "999<< (ICVInfo.InitValue1000? 
toString(ICVInfo.InitValue->getValue(), 10, true)1001: "IMPLEMENTATION_DEFINED");1002};10031004emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);1005}1006}1007}10081009/// Print OpenMP GPU kernels for testing.1010void printKernels() const {1011for (Function *F : SCC) {1012if (!omp::isOpenMPKernel(*F))1013continue;10141015auto Remark = [&](OptimizationRemarkAnalysis ORA) {1016return ORA << "OpenMP GPU kernel "1017<< ore::NV("OpenMPGPUKernel", F->getName()) << "\n";1018};10191020emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);1021}1022}10231024/// Return the call if \p U is a callee use in a regular call. If \p RFI is1025/// given it has to be the callee or a nullptr is returned.1026static CallInst *getCallIfRegularCall(1027Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {1028CallInst *CI = dyn_cast<CallInst>(U.getUser());1029if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&1030(!RFI ||1031(RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))1032return CI;1033return nullptr;1034}10351036/// Return the call if \p V is a regular call. If \p RFI is given it has to be1037/// the callee or a nullptr is returned.1038static CallInst *getCallIfRegularCall(1039Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {1040CallInst *CI = dyn_cast<CallInst>(&V);1041if (CI && !CI->hasOperandBundles() &&1042(!RFI ||1043(RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))1044return CI;1045return nullptr;1046}10471048private:1049/// Merge parallel regions when it is safe.1050bool mergeParallelRegions() {1051const unsigned CallbackCalleeOperand = 2;1052const unsigned CallbackFirstArgOperand = 3;1053using InsertPointTy = OpenMPIRBuilder::InsertPointTy;10541055// Check if there are any __kmpc_fork_call calls to merge.1056OMPInformationCache::RuntimeFunctionInfo &RFI =1057OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];10581059if (!RFI.Declaration)1060return false;10611062// Unmergable calls that prevent merging a parallel region.1063OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {1064OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],1065OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],1066};10671068bool Changed = false;1069LoopInfo *LI = nullptr;1070DominatorTree *DT = nullptr;10711072SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;10731074BasicBlock *StartBB = nullptr, *EndBB = nullptr;1075auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {1076BasicBlock *CGStartBB = CodeGenIP.getBlock();1077BasicBlock *CGEndBB =1078SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);1079assert(StartBB != nullptr && "StartBB should not be null");1080CGStartBB->getTerminator()->setSuccessor(0, StartBB);1081assert(EndBB != nullptr && "EndBB should not be null");1082EndBB->getTerminator()->setSuccessor(0, CGEndBB);1083};10841085auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,1086Value &Inner, Value *&ReplacementValue) -> InsertPointTy {1087ReplacementValue = &Inner;1088return CodeGenIP;1089};10901091auto FiniCB = [&](InsertPointTy CodeGenIP) {};10921093/// Create a sequential execution region within a merged parallel region,1094/// encapsulated in a master construct with a barrier for synchronization.1095auto CreateSequentialRegion = [&](Function *OuterFn,1096BasicBlock *OuterPredBB,1097Instruction *SeqStartI,1098Instruction *SeqEndI) {1099// Isolate the instructions of the sequential region to a separate1100// block.1101BasicBlock *ParentBB = 
SeqStartI->getParent();1102BasicBlock *SeqEndBB =1103SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);1104BasicBlock *SeqAfterBB =1105SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);1106BasicBlock *SeqStartBB =1107SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");11081109assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&1110"Expected a different CFG");1111const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();1112ParentBB->getTerminator()->eraseFromParent();11131114auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {1115BasicBlock *CGStartBB = CodeGenIP.getBlock();1116BasicBlock *CGEndBB =1117SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);1118assert(SeqStartBB != nullptr && "SeqStartBB should not be null");1119CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);1120assert(SeqEndBB != nullptr && "SeqEndBB should not be null");1121SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);1122};1123auto FiniCB = [&](InsertPointTy CodeGenIP) {};11241125// Find outputs from the sequential region to outside users and1126// broadcast their values to them.1127for (Instruction &I : *SeqStartBB) {1128SmallPtrSet<Instruction *, 4> OutsideUsers;1129for (User *Usr : I.users()) {1130Instruction &UsrI = *cast<Instruction>(Usr);1131// Ignore outputs to LT intrinsics, code extraction for the merged1132// parallel region will fix them.1133if (UsrI.isLifetimeStartOrEnd())1134continue;11351136if (UsrI.getParent() != SeqStartBB)1137OutsideUsers.insert(&UsrI);1138}11391140if (OutsideUsers.empty())1141continue;11421143// Emit an alloca in the outer region to store the broadcasted1144// value.1145const DataLayout &DL = M.getDataLayout();1146AllocaInst *AllocaI = new AllocaInst(1147I.getType(), DL.getAllocaAddrSpace(), nullptr,1148I.getName() + ".seq.output.alloc", OuterFn->front().begin());11491150// Emit a store instruction in the sequential BB to update the1151// value.1152new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());11531154// Emit a load instruction and replace the use of the output value1155// with it.1156for (Instruction *UsrI : OutsideUsers) {1157LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,1158I.getName() + ".seq.output.load",1159UsrI->getIterator());1160UsrI->replaceUsesOfWith(&I, LoadI);1161}1162}11631164OpenMPIRBuilder::LocationDescription Loc(1165InsertPointTy(ParentBB, ParentBB->end()), DL);1166InsertPointTy SeqAfterIP =1167OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);11681169OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);11701171BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());11721173LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn1174<< "\n");1175};11761177// Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all1178// contained in BB and only separated by instructions that can be1179// redundantly executed in parallel. The block BB is split before the first1180// call (in MergableCIs) and after the last so the entire region we merge1181// into a single parallel region is contained in a single basic block1182// without any other instructions. 
We use the OpenMPIRBuilder to outline1183// that block and call the resulting function via __kmpc_fork_call.1184auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,1185BasicBlock *BB) {1186// TODO: Change the interface to allow single CIs expanded, e.g, to1187// include an outer loop.1188assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");11891190auto Remark = [&](OptimizationRemark OR) {1191OR << "Parallel region merged with parallel region"1192<< (MergableCIs.size() > 2 ? "s" : "") << " at ";1193for (auto *CI : llvm::drop_begin(MergableCIs)) {1194OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());1195if (CI != MergableCIs.back())1196OR << ", ";1197}1198return OR << ".";1199};12001201emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);12021203Function *OriginalFn = BB->getParent();1204LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()1205<< " parallel regions in " << OriginalFn->getName()1206<< "\n");12071208// Isolate the calls to merge in a separate block.1209EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);1210BasicBlock *AfterBB =1211SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);1212StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,1213"omp.par.merged");12141215assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");1216const DebugLoc DL = BB->getTerminator()->getDebugLoc();1217BB->getTerminator()->eraseFromParent();12181219// Create sequential regions for sequential instructions that are1220// in-between mergable parallel regions.1221for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;1222It != End; ++It) {1223Instruction *ForkCI = *It;1224Instruction *NextForkCI = *(It + 1);12251226// Continue if there are not in-between instructions.1227if (ForkCI->getNextNode() == NextForkCI)1228continue;12291230CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),1231NextForkCI->getPrevNode());1232}12331234OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),1235DL);1236IRBuilder<>::InsertPoint AllocaIP(1237&OriginalFn->getEntryBlock(),1238OriginalFn->getEntryBlock().getFirstInsertionPt());1239// Create the merged parallel region with default proc binding, to1240// avoid overriding binding settings, and without explicit cancellation.1241InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(1242Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,1243OMP_PROC_BIND_default, /* IsCancellable */ false);1244BranchInst::Create(AfterBB, AfterIP.getBlock());12451246// Perform the actual outlining.1247OMPInfoCache.OMPBuilder.finalize(OriginalFn);12481249Function *OutlinedFn = MergableCIs.front()->getCaller();12501251// Replace the __kmpc_fork_call calls with direct calls to the outlined1252// callbacks.1253SmallVector<Value *, 8> Args;1254for (auto *CI : MergableCIs) {1255Value *Callee = CI->getArgOperand(CallbackCalleeOperand);1256FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;1257Args.clear();1258Args.push_back(OutlinedFn->getArg(0));1259Args.push_back(OutlinedFn->getArg(1));1260for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;1261++U)1262Args.push_back(CI->getArgOperand(U));12631264CallInst *NewCI =1265CallInst::Create(FT, Callee, Args, "", CI->getIterator());1266if (CI->getDebugLoc())1267NewCI->setDebugLoc(CI->getDebugLoc());12681269// Forward parameter attributes from the callback to the callee.1270for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;1271++U)1272for (const Attribute &A : 
CI->getAttributes().getParamAttrs(U))1273NewCI->addParamAttr(1274U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);12751276// Emit an explicit barrier to replace the implicit fork-join barrier.1277if (CI != MergableCIs.back()) {1278// TODO: Remove barrier if the merged parallel region includes the1279// 'nowait' clause.1280OMPInfoCache.OMPBuilder.createBarrier(1281InsertPointTy(NewCI->getParent(),1282NewCI->getNextNode()->getIterator()),1283OMPD_parallel);1284}12851286CI->eraseFromParent();1287}12881289assert(OutlinedFn != OriginalFn && "Outlining failed");1290CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);1291CGUpdater.reanalyzeFunction(*OriginalFn);12921293NumOpenMPParallelRegionsMerged += MergableCIs.size();12941295return true;1296};12971298// Helper function that identifes sequences of1299// __kmpc_fork_call uses in a basic block.1300auto DetectPRsCB = [&](Use &U, Function &F) {1301CallInst *CI = getCallIfRegularCall(U, &RFI);1302BB2PRMap[CI->getParent()].insert(CI);13031304return false;1305};13061307BB2PRMap.clear();1308RFI.foreachUse(SCC, DetectPRsCB);1309SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;1310// Find mergable parallel regions within a basic block that are1311// safe to merge, that is any in-between instructions can safely1312// execute in parallel after merging.1313// TODO: support merging across basic-blocks.1314for (auto &It : BB2PRMap) {1315auto &CIs = It.getSecond();1316if (CIs.size() < 2)1317continue;13181319BasicBlock *BB = It.getFirst();1320SmallVector<CallInst *, 4> MergableCIs;13211322/// Returns true if the instruction is mergable, false otherwise.1323/// A terminator instruction is unmergable by definition since merging1324/// works within a BB. Instructions before the mergable region are1325/// mergable if they are not calls to OpenMP runtime functions that may1326/// set different execution parameters for subsequent parallel regions.1327/// Instructions in-between parallel regions are mergable if they are not1328/// calls to any non-intrinsic function since that may call a non-mergable1329/// OpenMP runtime function.1330auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {1331// We do not merge across BBs, hence return false (unmergable) if the1332// instruction is a terminator.1333if (I.isTerminator())1334return false;13351336if (!isa<CallInst>(&I))1337return true;13381339CallInst *CI = cast<CallInst>(&I);1340if (IsBeforeMergableRegion) {1341Function *CalledFunction = CI->getCalledFunction();1342if (!CalledFunction)1343return false;1344// Return false (unmergable) if the call before the parallel1345// region calls an explicit affinity (proc_bind) or number of1346// threads (num_threads) compiler-generated function. Those settings1347// may be incompatible with following parallel regions.1348// TODO: ICV tracking to detect compatibility.1349for (const auto &RFI : UnmergableCallsInfo) {1350if (CalledFunction == RFI.Declaration)1351return false;1352}1353} else {1354// Return false (unmergable) if there is a call instruction1355// in-between parallel regions when it is not an intrinsic. 
It1356// may call an unmergable OpenMP runtime function in its callpath.1357// TODO: Keep track of possible OpenMP calls in the callpath.1358if (!isa<IntrinsicInst>(CI))1359return false;1360}13611362return true;1363};1364// Find maximal number of parallel region CIs that are safe to merge.1365for (auto It = BB->begin(), End = BB->end(); It != End;) {1366Instruction &I = *It;1367++It;13681369if (CIs.count(&I)) {1370MergableCIs.push_back(cast<CallInst>(&I));1371continue;1372}13731374// Continue expanding if the instruction is mergable.1375if (IsMergable(I, MergableCIs.empty()))1376continue;13771378// Forward the instruction iterator to skip the next parallel region1379// since there is an unmergable instruction which can affect it.1380for (; It != End; ++It) {1381Instruction &SkipI = *It;1382if (CIs.count(&SkipI)) {1383LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI1384<< " due to " << I << "\n");1385++It;1386break;1387}1388}13891390// Store mergable regions found.1391if (MergableCIs.size() > 1) {1392MergableCIsVector.push_back(MergableCIs);1393LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()1394<< " parallel regions in block " << BB->getName()1395<< " of function " << BB->getParent()->getName()1396<< "\n";);1397}13981399MergableCIs.clear();1400}14011402if (!MergableCIsVector.empty()) {1403Changed = true;14041405for (auto &MergableCIs : MergableCIsVector)1406Merge(MergableCIs, BB);1407MergableCIsVector.clear();1408}1409}14101411if (Changed) {1412/// Re-collect use for fork calls, emitted barrier calls, and1413/// any emitted master/end_master calls.1414OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);1415OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);1416OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);1417OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);1418}14191420return Changed;1421}14221423/// Try to delete parallel regions if possible.1424bool deleteParallelRegions() {1425const unsigned CallbackCalleeOperand = 2;14261427OMPInformationCache::RuntimeFunctionInfo &RFI =1428OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];14291430if (!RFI.Declaration)1431return false;14321433bool Changed = false;1434auto DeleteCallCB = [&](Use &U, Function &) {1435CallInst *CI = getCallIfRegularCall(U);1436if (!CI)1437return false;1438auto *Fn = dyn_cast<Function>(1439CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());1440if (!Fn)1441return false;1442if (!Fn->onlyReadsMemory())1443return false;1444if (!Fn->hasFnAttribute(Attribute::WillReturn))1445return false;14461447LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "1448<< CI->getCaller()->getName() << "\n");14491450auto Remark = [&](OptimizationRemark OR) {1451return OR << "Removing parallel region with no side-effects.";1452};1453emitRemark<OptimizationRemark>(CI, "OMP160", Remark);14541455CI->eraseFromParent();1456Changed = true;1457++NumOpenMPParallelRegionsDeleted;1458return true;1459};14601461RFI.foreachUse(SCC, DeleteCallCB);14621463return Changed;1464}14651466/// Try to eliminate runtime calls by reusing existing ones.1467bool deduplicateRuntimeCalls() {1468bool Changed = false;14691470RuntimeFunction DeduplicableRuntimeCallIDs[] = 
{1471OMPRTL_omp_get_num_threads,1472OMPRTL_omp_in_parallel,1473OMPRTL_omp_get_cancellation,1474OMPRTL_omp_get_supported_active_levels,1475OMPRTL_omp_get_level,1476OMPRTL_omp_get_ancestor_thread_num,1477OMPRTL_omp_get_team_size,1478OMPRTL_omp_get_active_level,1479OMPRTL_omp_in_final,1480OMPRTL_omp_get_proc_bind,1481OMPRTL_omp_get_num_places,1482OMPRTL_omp_get_num_procs,1483OMPRTL_omp_get_place_num,1484OMPRTL_omp_get_partition_num_places,1485OMPRTL_omp_get_partition_place_nums};14861487// Global-tid is handled separately.1488SmallSetVector<Value *, 16> GTIdArgs;1489collectGlobalThreadIdArguments(GTIdArgs);1490LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()1491<< " global thread ID arguments\n");14921493for (Function *F : SCC) {1494for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)1495Changed |= deduplicateRuntimeCalls(1496*F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);14971498// __kmpc_global_thread_num is special as we can replace it with an1499// argument in enough cases to make it worth trying.1500Value *GTIdArg = nullptr;1501for (Argument &Arg : F->args())1502if (GTIdArgs.count(&Arg)) {1503GTIdArg = &Arg;1504break;1505}1506Changed |= deduplicateRuntimeCalls(1507*F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);1508}15091510return Changed;1511}15121513/// Tries to remove known runtime symbols that are optional from the module.1514bool removeRuntimeSymbols() {1515// The RPC client symbol is defined in `libc` and indicates that something1516// required an RPC server. If its users were all optimized out then we can1517// safely remove it.1518// TODO: This should be somewhere more common in the future.1519if (GlobalVariable *GV = M.getNamedGlobal("__llvm_libc_rpc_client")) {1520if (!GV->getType()->isPointerTy())1521return false;15221523Constant *C = GV->getInitializer();1524if (!C)1525return false;15261527// Check to see if the only user of the RPC client is the external handle.1528GlobalVariable *Client = dyn_cast<GlobalVariable>(C->stripPointerCasts());1529if (!Client || Client->getNumUses() > 1 ||1530Client->user_back() != GV->getInitializer())1531return false;15321533Client->replaceAllUsesWith(PoisonValue::get(Client->getType()));1534Client->eraseFromParent();15351536GV->replaceAllUsesWith(PoisonValue::get(GV->getType()));1537GV->eraseFromParent();15381539return true;1540}1541return false;1542}15431544/// Tries to hide the latency of runtime calls that involve host to1545/// device memory transfers by splitting them into their "issue" and "wait"1546/// versions. The "issue" is moved upwards as much as possible. The "wait" is1547/// moved downards as much as possible. The "issue" issues the memory transfer1548/// asynchronously, returning a handle. 
The "wait" waits in the returned1549/// handle for the memory transfer to finish.1550bool hideMemTransfersLatency() {1551auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];1552bool Changed = false;1553auto SplitMemTransfers = [&](Use &U, Function &Decl) {1554auto *RTCall = getCallIfRegularCall(U, &RFI);1555if (!RTCall)1556return false;15571558OffloadArray OffloadArrays[3];1559if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))1560return false;15611562LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));15631564// TODO: Check if can be moved upwards.1565bool WasSplit = false;1566Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);1567if (WaitMovementPoint)1568WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);15691570Changed |= WasSplit;1571return WasSplit;1572};1573if (OMPInfoCache.runtimeFnsAvailable(1574{OMPRTL___tgt_target_data_begin_mapper_issue,1575OMPRTL___tgt_target_data_begin_mapper_wait}))1576RFI.foreachUse(SCC, SplitMemTransfers);15771578return Changed;1579}15801581void analysisGlobalization() {1582auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];15831584auto CheckGlobalization = [&](Use &U, Function &Decl) {1585if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {1586auto Remark = [&](OptimizationRemarkMissed ORM) {1587return ORM1588<< "Found thread data sharing on the GPU. "1589<< "Expect degraded performance due to data globalization.";1590};1591emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);1592}15931594return false;1595};15961597RFI.foreachUse(SCC, CheckGlobalization);1598}15991600/// Maps the values stored in the offload arrays passed as arguments to1601/// \p RuntimeCall into the offload arrays in \p OAs.1602bool getValuesInOffloadArrays(CallInst &RuntimeCall,1603MutableArrayRef<OffloadArray> OAs) {1604assert(OAs.size() == 3 && "Need space for three offload arrays!");16051606// A runtime call that involves memory offloading looks something like:1607// call void @__tgt_target_data_begin_mapper(arg0, arg1,1608// i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,1609// ...)1610// So, the idea is to access the allocas that allocate space for these1611// offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.1612// Therefore:1613// i8** %offload_baseptrs.1614Value *BasePtrsArg =1615RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);1616// i8** %offload_ptrs.1617Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);1618// i8** %offload_sizes.1619Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);16201621// Get values stored in **offload_baseptrs.1622auto *V = getUnderlyingObject(BasePtrsArg);1623if (!isa<AllocaInst>(V))1624return false;1625auto *BasePtrsArray = cast<AllocaInst>(V);1626if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))1627return false;16281629// Get values stored in **offload_baseptrs.1630V = getUnderlyingObject(PtrsArg);1631if (!isa<AllocaInst>(V))1632return false;1633auto *PtrsArray = cast<AllocaInst>(V);1634if (!OAs[1].initialize(*PtrsArray, RuntimeCall))1635return false;16361637// Get values stored in **offload_sizes.1638V = getUnderlyingObject(SizesArg);1639// If it's a [constant] global array don't analyze it.1640if (isa<GlobalValue>(V))1641return isa<Constant>(V);1642if (!isa<AllocaInst>(V))1643return false;16441645auto *SizesArray = cast<AllocaInst>(V);1646if (!OAs[2].initialize(*SizesArray, RuntimeCall))1647return false;16481649return true;1650}16511652/// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.1653/// For 
now this is a way to test that the function getValuesInOffloadArrays1654/// is working properly.1655/// TODO: Move this to a unittest when unittests are available for OpenMPOpt.1656void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {1657assert(OAs.size() == 3 && "There are three offload arrays to debug!");16581659LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");1660std::string ValuesStr;1661raw_string_ostream Printer(ValuesStr);1662std::string Separator = " --- ";16631664for (auto *BP : OAs[0].StoredValues) {1665BP->print(Printer);1666Printer << Separator;1667}1668LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");1669ValuesStr.clear();16701671for (auto *P : OAs[1].StoredValues) {1672P->print(Printer);1673Printer << Separator;1674}1675LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << ValuesStr << "\n");1676ValuesStr.clear();16771678for (auto *S : OAs[2].StoredValues) {1679S->print(Printer);1680Printer << Separator;1681}1682LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");1683}16841685/// Returns the instruction where the "wait" counterpart \p RuntimeCall can be1686/// moved. Returns nullptr if the movement is not possible, or not worth it.1687Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {1688// FIXME: This traverses only the BasicBlock where RuntimeCall is.1689// Make it traverse the CFG.16901691Instruction *CurrentI = &RuntimeCall;1692bool IsWorthIt = false;1693while ((CurrentI = CurrentI->getNextNode())) {16941695// TODO: Once we detect the regions to be offloaded we should use the1696// alias analysis manager to check if CurrentI may modify one of1697// the offloaded regions.1698if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {1699if (IsWorthIt)1700return CurrentI;17011702return nullptr;1703}17041705// FIXME: For now if we move it over anything without side effect1706// is worth it.1707IsWorthIt = true;1708}17091710// Return end of BasicBlock.1711return RuntimeCall.getParent()->getTerminator();1712}17131714/// Splits \p RuntimeCall into its "issue" and "wait" counterparts.1715bool splitTargetDataBeginRTC(CallInst &RuntimeCall,1716Instruction &WaitMovementPoint) {1717// Create stack allocated handle (__tgt_async_info) at the beginning of the1718// function. 
  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {
    // Create a stack-allocated handle (__tgt_async_info) at the beginning of
    // the function. It is used to store information about the async transfer,
    // allowing us to wait on it later.
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    Function *F = RuntimeCall.getCaller();
    BasicBlock &Entry = F->getEntryBlock();
    IRBuilder.Builder.SetInsertPoint(&Entry,
                                     Entry.getFirstNonPHIOrDbgOrAlloca());
    Value *Handle = IRBuilder.Builder.CreateAlloca(
        IRBuilder.AsyncInfo, /*ArraySize=*/nullptr, "handle");
    Handle =
        IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr);

    // Add "issue" runtime call declaration:
    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
    //   i8**, i8**, i64*, i64*)
    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    // Replace the RuntimeCall call site with its asynchronous version.
    SmallVector<Value *, 16> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    CallInst *IssueCallsite = CallInst::Create(IssueDecl, Args, /*NameStr=*/"",
                                               RuntimeCall.getIterator());
    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
    RuntimeCall.eraseFromParent();

    // Add "wait" runtime call declaration:
    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
        IssueCallsite->getArgOperand(
            OffloadArray::DeviceIDArgNum), // device_id.
        Handle                             // handle to wait on.
    };
    CallInst *WaitCallsite = CallInst::Create(
        WaitDecl, WaitParams, /*NameStr=*/"", WaitMovementPoint.getIterator());
    OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);

    return true;
  }

  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    // TODO: Figure out how to actually combine multiple debug locations. For
    //       now we just keep an existing one if there is a single choice.
    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;
      return NextIdent;
    }
    return nullptr;
  }
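  // Note on the fold above: combinedIdentStruct is applied over all ident_t*
  // arguments seen at the call sites of a runtime function (see
  // getCombinedIdentFromCallUsesIn below). Roughly: identical idents are
  // kept, the first global ident seen is kept as long as it stays the only
  // candidate, and in all other cases Ident/SingleChoice force a fresh
  // default ident to be built from scratch.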
  /// Return a `struct ident_t*` value that represents the ones used in the
  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
  /// return value we create one from scratch. We also do not yet combine
  /// information, e.g., the source locations, see combinedIdentStruct.
  Value *
  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;
    auto CombineIdentStruct = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
        return false;
      Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
                                  /* GlobalOnly */ true, SingleChoice);
      return false;
    };
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      // The IRBuilder uses the insertion block to get to the module, this is
      // unfortunate but we work around it for now.
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      // Create a fallback location if none was found.
      // TODO: Use the debug locations of the calls instead.
      uint32_t SrcLocStrSize;
      Constant *Loc =
          OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
    }
    return Ident;
  }

  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
  /// \p ReplVal if given.
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)
      return false;

    LLVM_DEBUG(
        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    // TODO: Use dominance to find a good position instead.
    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.arg_size();
      if (NumArgs == 0)
        return true;
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
        return false;
      for (unsigned U = 1; U < NumArgs; ++U)
        if (isa<Instruction>(CB.getArgOperand(U)))
          return false;
      return true;
    };

    if (!ReplVal) {
      auto *DT =
          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
      if (!DT)
        return false;
      Instruction *IP = nullptr;
      for (Use *U : *UV) {
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (IP)
            IP = DT->findNearestCommonDominator(IP, CI);
          else
            IP = CI;
          if (!CanBeMoved(*CI))
            continue;
          if (!ReplVal)
            ReplVal = CI;
        }
      }
      if (!ReplVal)
        return false;
      assert(IP && "Expected insertion point!");
      cast<Instruction>(ReplVal)->moveBefore(IP);
    }

    // If we use a call as a replacement value we need to make sure the ident is
    // valid at the new location.
For now we just pick a global one, either1870// existing and used by one of the calls, or created from scratch.1871if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {1872if (!CI->arg_empty() &&1873CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {1874Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,1875/* GlobalOnly */ true);1876CI->setArgOperand(0, Ident);1877}1878}18791880bool Changed = false;1881auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {1882CallInst *CI = getCallIfRegularCall(U, &RFI);1883if (!CI || CI == ReplVal || &F != &Caller)1884return false;1885assert(CI->getCaller() == &F && "Unexpected call!");18861887auto Remark = [&](OptimizationRemark OR) {1888return OR << "OpenMP runtime call "1889<< ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";1890};1891if (CI->getDebugLoc())1892emitRemark<OptimizationRemark>(CI, "OMP170", Remark);1893else1894emitRemark<OptimizationRemark>(&F, "OMP170", Remark);18951896CI->replaceAllUsesWith(ReplVal);1897CI->eraseFromParent();1898++NumOpenMPRuntimeCallsDeduplicated;1899Changed = true;1900return true;1901};1902RFI.foreachUse(SCC, ReplaceAndDeleteCB);19031904return Changed;1905}19061907/// Collect arguments that represent the global thread id in \p GTIdArgs.1908void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> >IdArgs) {1909// TODO: Below we basically perform a fixpoint iteration with a pessimistic1910// initialization. We could define an AbstractAttribute instead and1911// run the Attributor here once it can be run as an SCC pass.19121913// Helper to check the argument \p ArgNo at all call sites of \p F for1914// a GTId.1915auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {1916if (!F.hasLocalLinkage())1917return false;1918for (Use &U : F.uses()) {1919if (CallInst *CI = getCallIfRegularCall(U)) {1920Value *ArgOp = CI->getArgOperand(ArgNo);1921if (CI == &RefCI || GTIdArgs.count(ArgOp) ||1922getCallIfRegularCall(1923*ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))1924continue;1925}1926return false;1927}1928return true;1929};19301931// Helper to identify uses of a GTId as GTId arguments.1932auto AddUserArgs = [&](Value >Id) {1933for (Use &U : GTId.uses())1934if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))1935if (CI->isArgOperand(&U))1936if (Function *Callee = CI->getCalledFunction())1937if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))1938GTIdArgs.insert(Callee->getArg(U.getOperandNo()));1939};19401941// The argument users of __kmpc_global_thread_num calls are GTIds.1942OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =1943OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];19441945GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {1946if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))1947AddUserArgs(*CI);1948return false;1949});19501951// Transitively search for more arguments by looking at the users of the1952// ones we know already. 
During the search the GTIdArgs vector is extended1953// so we cannot cache the size nor can we use a range based for.1954for (unsigned U = 0; U < GTIdArgs.size(); ++U)1955AddUserArgs(*GTIdArgs[U]);1956}19571958/// Kernel (=GPU) optimizations and utility functions1959///1960///{{19611962/// Cache to remember the unique kernel for a function.1963DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;19641965/// Find the unique kernel that will execute \p F, if any.1966Kernel getUniqueKernelFor(Function &F);19671968/// Find the unique kernel that will execute \p I, if any.1969Kernel getUniqueKernelFor(Instruction &I) {1970return getUniqueKernelFor(*I.getFunction());1971}19721973/// Rewrite the device (=GPU) code state machine create in non-SPMD mode in1974/// the cases we can avoid taking the address of a function.1975bool rewriteDeviceCodeStateMachine();19761977///1978///}}19791980/// Emit a remark generically1981///1982/// This template function can be used to generically emit a remark. The1983/// RemarkKind should be one of the following:1984/// - OptimizationRemark to indicate a successful optimization attempt1985/// - OptimizationRemarkMissed to report a failed optimization attempt1986/// - OptimizationRemarkAnalysis to provide additional information about an1987/// optimization attempt1988///1989/// The remark is built using a callback function provided by the caller that1990/// takes a RemarkKind as input and returns a RemarkKind.1991template <typename RemarkKind, typename RemarkCallBack>1992void emitRemark(Instruction *I, StringRef RemarkName,1993RemarkCallBack &&RemarkCB) const {1994Function *F = I->getParent()->getParent();1995auto &ORE = OREGetter(F);19961997if (RemarkName.starts_with("OMP"))1998ORE.emit([&]() {1999return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))2000<< " [" << RemarkName << "]";2001});2002else2003ORE.emit(2004[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });2005}20062007/// Emit a remark on a function.2008template <typename RemarkKind, typename RemarkCallBack>2009void emitRemark(Function *F, StringRef RemarkName,2010RemarkCallBack &&RemarkCB) const {2011auto &ORE = OREGetter(F);20122013if (RemarkName.starts_with("OMP"))2014ORE.emit([&]() {2015return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))2016<< " [" << RemarkName << "]";2017});2018else2019ORE.emit(2020[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });2021}20222023/// The underlying module.2024Module &M;20252026/// The SCC we are operating on.2027SmallVectorImpl<Function *> &SCC;20282029/// Callback to update the call graph, the first argument is a removed call,2030/// the second an optional replacement call.2031CallGraphUpdater &CGUpdater;20322033/// Callback to get an OptimizationRemarkEmitter from a Function *2034OptimizationRemarkGetter OREGetter;20352036/// OpenMP-specific information cache. 
Also Used for Attributor runs.2037OMPInformationCache &OMPInfoCache;20382039/// Attributor instance.2040Attributor &A;20412042/// Helper function to run Attributor on SCC.2043bool runAttributor(bool IsModulePass) {2044if (SCC.empty())2045return false;20462047registerAAs(IsModulePass);20482049ChangeStatus Changed = A.run();20502051LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()2052<< " functions, result: " << Changed << ".\n");20532054if (Changed == ChangeStatus::CHANGED)2055OMPInfoCache.invalidateAnalyses();20562057return Changed == ChangeStatus::CHANGED;2058}20592060void registerFoldRuntimeCall(RuntimeFunction RF);20612062/// Populate the Attributor with abstract attribute opportunities in the2063/// functions.2064void registerAAs(bool IsModulePass);20652066public:2067/// Callback to register AAs for live functions, including internal functions2068/// marked live during the traversal.2069static void registerAAsForFunction(Attributor &A, const Function &F);2070};20712072Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {2073if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&2074!OMPInfoCache.CGSCC->contains(&F))2075return nullptr;20762077// Use a scope to keep the lifetime of the CachedKernel short.2078{2079std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];2080if (CachedKernel)2081return *CachedKernel;20822083// TODO: We should use an AA to create an (optimistic and callback2084// call-aware) call graph. For now we stick to simple patterns that2085// are less powerful, basically the worst fixpoint.2086if (isOpenMPKernel(F)) {2087CachedKernel = Kernel(&F);2088return *CachedKernel;2089}20902091CachedKernel = nullptr;2092if (!F.hasLocalLinkage()) {20932094// See https://openmp.llvm.org/remarks/OptimizationRemarks.html2095auto Remark = [&](OptimizationRemarkAnalysis ORA) {2096return ORA << "Potentially unknown OpenMP target region caller.";2097};2098emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);20992100return nullptr;2101}2102}21032104auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {2105if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {2106// Allow use in equality comparisons.2107if (Cmp->isEquality())2108return getUniqueKernelFor(*Cmp);2109return nullptr;2110}2111if (auto *CB = dyn_cast<CallBase>(U.getUser())) {2112// Allow direct calls.2113if (CB->isCallee(&U))2114return getUniqueKernelFor(*CB);21152116OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =2117OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];2118// Allow the use in __kmpc_parallel_51 calls.2119if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))2120return getUniqueKernelFor(*CB);2121return nullptr;2122}2123// Disallow every other use.2124return nullptr;2125};21262127// TODO: In the future we want to track more than just a unique kernel.2128SmallPtrSet<Kernel, 2> PotentialKernels;2129OMPInformationCache::foreachUse(F, [&](const Use &U) {2130PotentialKernels.insert(GetUniqueKernelForUse(U));2131});21322133Kernel K = nullptr;2134if (PotentialKernels.size() == 1)2135K = *PotentialKernels.begin();21362137// Cache the result.2138UniqueKernelMap[&F] = K;21392140return K;2141}21422143bool OpenMPOpt::rewriteDeviceCodeStateMachine() {2144OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =2145OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];21462147bool Changed = false;2148if (!KernelParallelRFI)2149return Changed;21502151// If we have disabled state machine changes, exit2152if (DisableOpenMPOptStateMachineRewrite)2153return Changed;21542155for (Function *F : 
SCC) {21562157// Check if the function is a use in a __kmpc_parallel_51 call at2158// all.2159bool UnknownUse = false;2160bool KernelParallelUse = false;2161unsigned NumDirectCalls = 0;21622163SmallVector<Use *, 2> ToBeReplacedStateMachineUses;2164OMPInformationCache::foreachUse(*F, [&](Use &U) {2165if (auto *CB = dyn_cast<CallBase>(U.getUser()))2166if (CB->isCallee(&U)) {2167++NumDirectCalls;2168return;2169}21702171if (isa<ICmpInst>(U.getUser())) {2172ToBeReplacedStateMachineUses.push_back(&U);2173return;2174}21752176// Find wrapper functions that represent parallel kernels.2177CallInst *CI =2178OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);2179const unsigned int WrapperFunctionArgNo = 6;2180if (!KernelParallelUse && CI &&2181CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {2182KernelParallelUse = true;2183ToBeReplacedStateMachineUses.push_back(&U);2184return;2185}2186UnknownUse = true;2187});21882189// Do not emit a remark if we haven't seen a __kmpc_parallel_512190// use.2191if (!KernelParallelUse)2192continue;21932194// If this ever hits, we should investigate.2195// TODO: Checking the number of uses is not a necessary restriction and2196// should be lifted.2197if (UnknownUse || NumDirectCalls != 1 ||2198ToBeReplacedStateMachineUses.size() > 2) {2199auto Remark = [&](OptimizationRemarkAnalysis ORA) {2200return ORA << "Parallel region is used in "2201<< (UnknownUse ? "unknown" : "unexpected")2202<< " ways. Will not attempt to rewrite the state machine.";2203};2204emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);2205continue;2206}22072208// Even if we have __kmpc_parallel_51 calls, we (for now) give2209// up if the function is not called from a unique kernel.2210Kernel K = getUniqueKernelFor(*F);2211if (!K) {2212auto Remark = [&](OptimizationRemarkAnalysis ORA) {2213return ORA << "Parallel region is not called from a unique kernel. "2214"Will not attempt to rewrite the state machine.";2215};2216emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);2217continue;2218}22192220// We now know F is a parallel body function called only from the kernel K.2221// We also identified the state machine uses in which we replace the2222// function pointer by a new global symbol for identification purposes. 
This2223// ensures only direct calls to the function are left.22242225Module &M = *F->getParent();2226Type *Int8Ty = Type::getInt8Ty(M.getContext());22272228auto *ID = new GlobalVariable(2229M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,2230UndefValue::get(Int8Ty), F->getName() + ".ID");22312232for (Use *U : ToBeReplacedStateMachineUses)2233U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(2234ID, U->get()->getType()));22352236++NumOpenMPParallelRegionsReplacedInGPUStateMachine;22372238Changed = true;2239}22402241return Changed;2242}22432244/// Abstract Attribute for tracking ICV values.2245struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {2246using Base = StateWrapper<BooleanState, AbstractAttribute>;2247AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}22482249/// Returns true if value is assumed to be tracked.2250bool isAssumedTracked() const { return getAssumed(); }22512252/// Returns true if value is known to be tracked.2253bool isKnownTracked() const { return getAssumed(); }22542255/// Create an abstract attribute biew for the position \p IRP.2256static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);22572258/// Return the value with which \p I can be replaced for specific \p ICV.2259virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV,2260const Instruction *I,2261Attributor &A) const {2262return std::nullopt;2263}22642265/// Return an assumed unique ICV value if a single candidate is found. If2266/// there cannot be one, return a nullptr. If it is not clear yet, return2267/// std::nullopt.2268virtual std::optional<Value *>2269getUniqueReplacementValue(InternalControlVar ICV) const = 0;22702271// Currently only nthreads is being tracked.2272// this array will only grow with time.2273InternalControlVar TrackableICVs[1] = {ICV_nthreads};22742275/// See AbstractAttribute::getName()2276const std::string getName() const override { return "AAICVTracker"; }22772278/// See AbstractAttribute::getIdAddr()2279const char *getIdAddr() const override { return &ID; }22802281/// This function should return true if the type of the \p AA is AAICVTracker2282static bool classof(const AbstractAttribute *AA) {2283return (AA->getIdAddr() == &ID);2284}22852286static const char ID;2287};22882289struct AAICVTrackerFunction : public AAICVTracker {2290AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)2291: AAICVTracker(IRP, A) {}22922293// FIXME: come up with better string.2294const std::string getAsStr(Attributor *) const override {2295return "ICVTrackerFunction";2296}22972298// FIXME: come up with some stats.2299void trackStatistics() const override {}23002301/// We don't manifest anything for this AA.2302ChangeStatus manifest(Attributor &A) override {2303return ChangeStatus::UNCHANGED;2304}23052306// Map of ICV to their values at specific program point.2307EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,2308InternalControlVar::ICV___last>2309ICVReplacementValuesMap;23102311ChangeStatus updateImpl(Attributor &A) override {2312ChangeStatus HasChanged = ChangeStatus::UNCHANGED;23132314Function *F = getAnchorScope();23152316auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());23172318for (InternalControlVar ICV : TrackableICVs) {2319auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];23202321auto &ValuesMap = ICVReplacementValuesMap[ICV];2322auto TrackValues = [&](Use &U, Function &) {2323CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);2324if 
(!CI)2325return false;23262327// FIXME: handle setters with more that 1 arguments.2328/// Track new value.2329if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)2330HasChanged = ChangeStatus::CHANGED;23312332return false;2333};23342335auto CallCheck = [&](Instruction &I) {2336std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);2337if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)2338HasChanged = ChangeStatus::CHANGED;23392340return true;2341};23422343// Track all changes of an ICV.2344SetterRFI.foreachUse(TrackValues, F);23452346bool UsedAssumedInformation = false;2347A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},2348UsedAssumedInformation,2349/* CheckBBLivenessOnly */ true);23502351/// TODO: Figure out a way to avoid adding entry in2352/// ICVReplacementValuesMap2353Instruction *Entry = &F->getEntryBlock().front();2354if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))2355ValuesMap.insert(std::make_pair(Entry, nullptr));2356}23572358return HasChanged;2359}23602361/// Helper to check if \p I is a call and get the value for it if it is2362/// unique.2363std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,2364InternalControlVar &ICV) const {23652366const auto *CB = dyn_cast<CallBase>(&I);2367if (!CB || CB->hasFnAttr("no_openmp") ||2368CB->hasFnAttr("no_openmp_routines"))2369return std::nullopt;23702371auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());2372auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];2373auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];2374Function *CalledFunction = CB->getCalledFunction();23752376// Indirect call, assume ICV changes.2377if (CalledFunction == nullptr)2378return nullptr;2379if (CalledFunction == GetterRFI.Declaration)2380return std::nullopt;2381if (CalledFunction == SetterRFI.Declaration) {2382if (ICVReplacementValuesMap[ICV].count(&I))2383return ICVReplacementValuesMap[ICV].lookup(&I);23842385return nullptr;2386}23872388// Since we don't know, assume it changes the ICV.2389if (CalledFunction->isDeclaration())2390return nullptr;23912392const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(2393*this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);23942395if (ICVTrackingAA->isAssumedTracked()) {2396std::optional<Value *> URV =2397ICVTrackingAA->getUniqueReplacementValue(ICV);2398if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I),2399OMPInfoCache)))2400return URV;2401}24022403// If we don't know, assume it changes.2404return nullptr;2405}24062407// We don't check unique value for a function, so return std::nullopt.2408std::optional<Value *>2409getUniqueReplacementValue(InternalControlVar ICV) const override {2410return std::nullopt;2411}24122413/// Return the value with which \p I can be replaced for specific \p ICV.2414std::optional<Value *> getReplacementValue(InternalControlVar ICV,2415const Instruction *I,2416Attributor &A) const override {2417const auto &ValuesMap = ICVReplacementValuesMap[ICV];2418if (ValuesMap.count(I))2419return ValuesMap.lookup(I);24202421SmallVector<const Instruction *, 16> Worklist;2422SmallPtrSet<const Instruction *, 16> Visited;2423Worklist.push_back(I);24242425std::optional<Value *> ReplVal;24262427while (!Worklist.empty()) {2428const Instruction *CurrInst = Worklist.pop_back_val();2429if (!Visited.insert(CurrInst).second)2430continue;24312432const BasicBlock *CurrBB = CurrInst->getParent();24332434// Go up and look for all potential 
setters/calls that might change the2435// ICV.2436while ((CurrInst = CurrInst->getPrevNode())) {2437if (ValuesMap.count(CurrInst)) {2438std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);2439// Unknown value, track new.2440if (!ReplVal) {2441ReplVal = NewReplVal;2442break;2443}24442445// If we found a new value, we can't know the icv value anymore.2446if (NewReplVal)2447if (ReplVal != NewReplVal)2448return nullptr;24492450break;2451}24522453std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);2454if (!NewReplVal)2455continue;24562457// Unknown value, track new.2458if (!ReplVal) {2459ReplVal = NewReplVal;2460break;2461}24622463// if (NewReplVal.hasValue())2464// We found a new value, we can't know the icv value anymore.2465if (ReplVal != NewReplVal)2466return nullptr;2467}24682469// If we are in the same BB and we have a value, we are done.2470if (CurrBB == I->getParent() && ReplVal)2471return ReplVal;24722473// Go through all predecessors and add terminators for analysis.2474for (const BasicBlock *Pred : predecessors(CurrBB))2475if (const Instruction *Terminator = Pred->getTerminator())2476Worklist.push_back(Terminator);2477}24782479return ReplVal;2480}2481};24822483struct AAICVTrackerFunctionReturned : AAICVTracker {2484AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)2485: AAICVTracker(IRP, A) {}24862487// FIXME: come up with better string.2488const std::string getAsStr(Attributor *) const override {2489return "ICVTrackerFunctionReturned";2490}24912492// FIXME: come up with some stats.2493void trackStatistics() const override {}24942495/// We don't manifest anything for this AA.2496ChangeStatus manifest(Attributor &A) override {2497return ChangeStatus::UNCHANGED;2498}24992500// Map of ICV to their values at specific program point.2501EnumeratedArray<std::optional<Value *>, InternalControlVar,2502InternalControlVar::ICV___last>2503ICVReplacementValuesMap;25042505/// Return the value with which \p I can be replaced for specific \p ICV.2506std::optional<Value *>2507getUniqueReplacementValue(InternalControlVar ICV) const override {2508return ICVReplacementValuesMap[ICV];2509}25102511ChangeStatus updateImpl(Attributor &A) override {2512ChangeStatus Changed = ChangeStatus::UNCHANGED;2513const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(2514*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);25152516if (!ICVTrackingAA->isAssumedTracked())2517return indicatePessimisticFixpoint();25182519for (InternalControlVar ICV : TrackableICVs) {2520std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];2521std::optional<Value *> UniqueICVValue;25222523auto CheckReturnInst = [&](Instruction &I) {2524std::optional<Value *> NewReplVal =2525ICVTrackingAA->getReplacementValue(ICV, &I, A);25262527// If we found a second ICV value there is no unique returned value.2528if (UniqueICVValue && UniqueICVValue != NewReplVal)2529return false;25302531UniqueICVValue = NewReplVal;25322533return true;2534};25352536bool UsedAssumedInformation = false;2537if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},2538UsedAssumedInformation,2539/* CheckBBLivenessOnly */ true))2540UniqueICVValue = nullptr;25412542if (UniqueICVValue == ReplVal)2543continue;25442545ReplVal = UniqueICVValue;2546Changed = ChangeStatus::CHANGED;2547}25482549return Changed;2550}2551};25522553struct AAICVTrackerCallSite : AAICVTracker {2554AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)2555: AAICVTracker(IRP, A) {}25562557void initialize(Attributor &A) 
override {2558assert(getAnchorScope() && "Expected anchor function");25592560// We only initialize this AA for getters, so we need to know which ICV it2561// gets.2562auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());2563for (InternalControlVar ICV : TrackableICVs) {2564auto ICVInfo = OMPInfoCache.ICVs[ICV];2565auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];2566if (Getter.Declaration == getAssociatedFunction()) {2567AssociatedICV = ICVInfo.Kind;2568return;2569}2570}25712572/// Unknown ICV.2573indicatePessimisticFixpoint();2574}25752576ChangeStatus manifest(Attributor &A) override {2577if (!ReplVal || !*ReplVal)2578return ChangeStatus::UNCHANGED;25792580A.changeAfterManifest(IRPosition::inst(*getCtxI()), **ReplVal);2581A.deleteAfterManifest(*getCtxI());25822583return ChangeStatus::CHANGED;2584}25852586// FIXME: come up with better string.2587const std::string getAsStr(Attributor *) const override {2588return "ICVTrackerCallSite";2589}25902591// FIXME: come up with some stats.2592void trackStatistics() const override {}25932594InternalControlVar AssociatedICV;2595std::optional<Value *> ReplVal;25962597ChangeStatus updateImpl(Attributor &A) override {2598const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(2599*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);26002601// We don't have any information, so we assume it changes the ICV.2602if (!ICVTrackingAA->isAssumedTracked())2603return indicatePessimisticFixpoint();26042605std::optional<Value *> NewReplVal =2606ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);26072608if (ReplVal == NewReplVal)2609return ChangeStatus::UNCHANGED;26102611ReplVal = NewReplVal;2612return ChangeStatus::CHANGED;2613}26142615// Return the value with which associated value can be replaced for specific2616// \p ICV.2617std::optional<Value *>2618getUniqueReplacementValue(InternalControlVar ICV) const override {2619return ReplVal;2620}2621};26222623struct AAICVTrackerCallSiteReturned : AAICVTracker {2624AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)2625: AAICVTracker(IRP, A) {}26262627// FIXME: come up with better string.2628const std::string getAsStr(Attributor *) const override {2629return "ICVTrackerCallSiteReturned";2630}26312632// FIXME: come up with some stats.2633void trackStatistics() const override {}26342635/// We don't manifest anything for this AA.2636ChangeStatus manifest(Attributor &A) override {2637return ChangeStatus::UNCHANGED;2638}26392640// Map of ICV to their values at specific program point.2641EnumeratedArray<std::optional<Value *>, InternalControlVar,2642InternalControlVar::ICV___last>2643ICVReplacementValuesMap;26442645/// Return the value with which associated value can be replaced for specific2646/// \p ICV.2647std::optional<Value *>2648getUniqueReplacementValue(InternalControlVar ICV) const override {2649return ICVReplacementValuesMap[ICV];2650}26512652ChangeStatus updateImpl(Attributor &A) override {2653ChangeStatus Changed = ChangeStatus::UNCHANGED;2654const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(2655*this, IRPosition::returned(*getAssociatedFunction()),2656DepClassTy::REQUIRED);26572658// We don't have any information, so we assume it changes the ICV.2659if (!ICVTrackingAA->isAssumedTracked())2660return indicatePessimisticFixpoint();26612662for (InternalControlVar ICV : TrackableICVs) {2663std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];2664std::optional<Value *> NewReplVal =2665ICVTrackingAA->getUniqueReplacementValue(ICV);26662667if 
(ReplVal == NewReplVal)2668continue;26692670ReplVal = NewReplVal;2671Changed = ChangeStatus::CHANGED;2672}2673return Changed;2674}2675};26762677/// Determines if \p BB exits the function unconditionally itself or reaches a2678/// block that does through only unique successors.2679static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {2680if (succ_empty(BB))2681return true;2682const BasicBlock *const Successor = BB->getUniqueSuccessor();2683if (!Successor)2684return false;2685return hasFunctionEndAsUniqueSuccessor(Successor);2686}26872688struct AAExecutionDomainFunction : public AAExecutionDomain {2689AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)2690: AAExecutionDomain(IRP, A) {}26912692~AAExecutionDomainFunction() { delete RPOT; }26932694void initialize(Attributor &A) override {2695Function *F = getAnchorScope();2696assert(F && "Expected anchor function");2697RPOT = new ReversePostOrderTraversal<Function *>(F);2698}26992700const std::string getAsStr(Attributor *) const override {2701unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;2702for (auto &It : BEDMap) {2703if (!It.getFirst())2704continue;2705TotalBlocks++;2706InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;2707AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&2708It.getSecond().IsReachingAlignedBarrierOnly;2709}2710return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +2711std::to_string(AlignedBlocks) + " of " +2712std::to_string(TotalBlocks) +2713" executed by initial thread / aligned";2714}27152716/// See AbstractAttribute::trackStatistics().2717void trackStatistics() const override {}27182719ChangeStatus manifest(Attributor &A) override {2720LLVM_DEBUG({2721for (const BasicBlock &BB : *getAnchorScope()) {2722if (!isExecutedByInitialThreadOnly(BB))2723continue;2724dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "2725<< BB.getName() << " is executed by a single thread.\n";2726}2727});27282729ChangeStatus Changed = ChangeStatus::UNCHANGED;27302731if (DisableOpenMPOptBarrierElimination)2732return Changed;27332734SmallPtrSet<CallBase *, 16> DeletedBarriers;2735auto HandleAlignedBarrier = [&](CallBase *CB) {2736const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];2737if (!ED.IsReachedFromAlignedBarrierOnly ||2738ED.EncounteredNonLocalSideEffect)2739return;2740if (!ED.EncounteredAssumes.empty() && !A.isModulePass())2741return;27422743// We can remove this barrier, if it is one, or aligned barriers reaching2744// the kernel end (if CB is nullptr). Aligned barriers reaching the kernel2745// end should only be removed if the kernel end is their unique successor;2746// otherwise, they may have side-effects that aren't accounted for in the2747// kernel end in their other successors. 
If those barriers have other2748// barriers reaching them, those can be transitively removed as well as2749// long as the kernel end is also their unique successor.2750if (CB) {2751DeletedBarriers.insert(CB);2752A.deleteAfterManifest(*CB);2753++NumBarriersEliminated;2754Changed = ChangeStatus::CHANGED;2755} else if (!ED.AlignedBarriers.empty()) {2756Changed = ChangeStatus::CHANGED;2757SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(),2758ED.AlignedBarriers.end());2759SmallSetVector<CallBase *, 16> Visited;2760while (!Worklist.empty()) {2761CallBase *LastCB = Worklist.pop_back_val();2762if (!Visited.insert(LastCB))2763continue;2764if (LastCB->getFunction() != getAnchorScope())2765continue;2766if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))2767continue;2768if (!DeletedBarriers.count(LastCB)) {2769++NumBarriersEliminated;2770A.deleteAfterManifest(*LastCB);2771continue;2772}2773// The final aligned barrier (LastCB) reaching the kernel end was2774// removed already. This means we can go one step further and remove2775// the barriers encoutered last before (LastCB).2776const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];2777Worklist.append(LastED.AlignedBarriers.begin(),2778LastED.AlignedBarriers.end());2779}2780}27812782// If we actually eliminated a barrier we need to eliminate the associated2783// llvm.assumes as well to avoid creating UB.2784if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))2785for (auto *AssumeCB : ED.EncounteredAssumes)2786A.deleteAfterManifest(*AssumeCB);2787};27882789for (auto *CB : AlignedBarriers)2790HandleAlignedBarrier(CB);27912792// Handle the "kernel end barrier" for kernels too.2793if (omp::isOpenMPKernel(*getAnchorScope()))2794HandleAlignedBarrier(nullptr);27952796return Changed;2797}27982799bool isNoOpFence(const FenceInst &FI) const override {2800return getState().isValidState() && !NonNoOpFences.count(&FI);2801}28022803/// Merge barrier and assumption information from \p PredED into the successor2804/// \p ED.2805void2806mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,2807const ExecutionDomainTy &PredED);28082809/// Merge all information from \p PredED into the successor \p ED. 
If2810/// \p InitialEdgeOnly is set, only the initial edge will enter the block2811/// represented by \p ED from this predecessor.2812bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,2813const ExecutionDomainTy &PredED,2814bool InitialEdgeOnly = false);28152816/// Accumulate information for the entry block in \p EntryBBED.2817bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);28182819/// See AbstractAttribute::updateImpl.2820ChangeStatus updateImpl(Attributor &A) override;28212822/// Query interface, see AAExecutionDomain2823///{2824bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {2825if (!isValidState())2826return false;2827assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");2828return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;2829}28302831bool isExecutedInAlignedRegion(Attributor &A,2832const Instruction &I) const override {2833assert(I.getFunction() == getAnchorScope() &&2834"Instruction is out of scope!");2835if (!isValidState())2836return false;28372838bool ForwardIsOk = true;2839const Instruction *CurI;28402841// Check forward until a call or the block end is reached.2842CurI = &I;2843do {2844auto *CB = dyn_cast<CallBase>(CurI);2845if (!CB)2846continue;2847if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))2848return true;2849const auto &It = CEDMap.find({CB, PRE});2850if (It == CEDMap.end())2851continue;2852if (!It->getSecond().IsReachingAlignedBarrierOnly)2853ForwardIsOk = false;2854break;2855} while ((CurI = CurI->getNextNonDebugInstruction()));28562857if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)2858ForwardIsOk = false;28592860// Check backward until a call or the block beginning is reached.2861CurI = &I;2862do {2863auto *CB = dyn_cast<CallBase>(CurI);2864if (!CB)2865continue;2866if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))2867return true;2868const auto &It = CEDMap.find({CB, POST});2869if (It == CEDMap.end())2870continue;2871if (It->getSecond().IsReachedFromAlignedBarrierOnly)2872break;2873return false;2874} while ((CurI = CurI->getPrevNonDebugInstruction()));28752876// Delayed decision on the forward pass to allow aligned barrier detection2877// in the backwards traversal.2878if (!ForwardIsOk)2879return false;28802881if (!CurI) {2882const BasicBlock *BB = I.getParent();2883if (BB == &BB->getParent()->getEntryBlock())2884return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;2885if (!llvm::all_of(predecessors(BB), [&](const BasicBlock *PredBB) {2886return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;2887})) {2888return false;2889}2890}28912892// On neither traversal we found a anything but aligned barriers.2893return true;2894}28952896ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {2897assert(isValidState() &&2898"No request should be made against an invalid state!");2899return BEDMap.lookup(&BB);2900}2901std::pair<ExecutionDomainTy, ExecutionDomainTy>2902getExecutionDomain(const CallBase &CB) const override {2903assert(isValidState() &&2904"No request should be made against an invalid state!");2905return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};2906}2907ExecutionDomainTy getFunctionExecutionDomain() const override {2908assert(isValidState() &&2909"No request should be made against an invalid state!");2910return InterProceduralED;2911}2912///}29132914// Check if the edge into the successor block contains a condition that only2915// lets the main thread execute it.2916static bool 
isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,2917BasicBlock &SuccessorBB) {2918if (!Edge || !Edge->isConditional())2919return false;2920if (Edge->getSuccessor(0) != &SuccessorBB)2921return false;29222923auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());2924if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())2925return false;29262927ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));2928if (!C)2929return false;29302931// Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)2932if (C->isAllOnesValue()) {2933auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));2934auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());2935auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];2936CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;2937if (!CB)2938return false;2939ConstantStruct *KernelEnvC =2940KernelInfo::getKernelEnvironementFromKernelInitCB(CB);2941ConstantInt *ExecModeC =2942KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);2943return ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC;2944}29452946if (C->isZero()) {2947// Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()2948if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))2949if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)2950return true;29512952// Match: 0 == llvm.amdgcn.workitem.id.x()2953if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))2954if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)2955return true;2956}29572958return false;2959};29602961/// Mapping containing information about the function for other AAs.2962ExecutionDomainTy InterProceduralED;29632964enum Direction { PRE = 0, POST = 1 };2965/// Mapping containing information per block.2966DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;2967DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>2968CEDMap;2969SmallSetVector<CallBase *, 16> AlignedBarriers;29702971ReversePostOrderTraversal<Function *> *RPOT = nullptr;29722973/// Set \p R to \V and report true if that changed \p R.2974static bool setAndRecord(bool &R, bool V) {2975bool Eq = (R == V);2976R = V;2977return !Eq;2978}29792980/// Collection of fences known to be non-no-opt. 
All fences not in this set2981/// can be assumed no-opt.2982SmallPtrSet<const FenceInst *, 8> NonNoOpFences;2983};29842985void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(2986Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {2987for (auto *EA : PredED.EncounteredAssumes)2988ED.addAssumeInst(A, *EA);29892990for (auto *AB : PredED.AlignedBarriers)2991ED.addAlignedBarrier(A, *AB);2992}29932994bool AAExecutionDomainFunction::mergeInPredecessor(2995Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,2996bool InitialEdgeOnly) {29972998bool Changed = false;2999Changed |=3000setAndRecord(ED.IsExecutedByInitialThreadOnly,3001InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&3002ED.IsExecutedByInitialThreadOnly));30033004Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,3005ED.IsReachedFromAlignedBarrierOnly &&3006PredED.IsReachedFromAlignedBarrierOnly);3007Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,3008ED.EncounteredNonLocalSideEffect |3009PredED.EncounteredNonLocalSideEffect);3010// Do not track assumptions and barriers as part of Changed.3011if (ED.IsReachedFromAlignedBarrierOnly)3012mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);3013else3014ED.clearAssumeInstAndAlignedBarriers();3015return Changed;3016}30173018bool AAExecutionDomainFunction::handleCallees(Attributor &A,3019ExecutionDomainTy &EntryBBED) {3020SmallVector<std::pair<ExecutionDomainTy, ExecutionDomainTy>, 4> CallSiteEDs;3021auto PredForCallSite = [&](AbstractCallSite ACS) {3022const auto *EDAA = A.getAAFor<AAExecutionDomain>(3023*this, IRPosition::function(*ACS.getInstruction()->getFunction()),3024DepClassTy::OPTIONAL);3025if (!EDAA || !EDAA->getState().isValidState())3026return false;3027CallSiteEDs.emplace_back(3028EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));3029return true;3030};30313032ExecutionDomainTy ExitED;3033bool AllCallSitesKnown;3034if (A.checkForAllCallSites(PredForCallSite, *this,3035/* RequiresAllCallSites */ true,3036AllCallSitesKnown)) {3037for (const auto &[CSInED, CSOutED] : CallSiteEDs) {3038mergeInPredecessor(A, EntryBBED, CSInED);3039ExitED.IsReachingAlignedBarrierOnly &=3040CSOutED.IsReachingAlignedBarrierOnly;3041}30423043} else {3044// We could not find all predecessors, so this is either a kernel or a3045// function with external linkage (or with some other weird uses).3046if (omp::isOpenMPKernel(*getAnchorScope())) {3047EntryBBED.IsExecutedByInitialThreadOnly = false;3048EntryBBED.IsReachedFromAlignedBarrierOnly = true;3049EntryBBED.EncounteredNonLocalSideEffect = false;3050ExitED.IsReachingAlignedBarrierOnly = false;3051} else {3052EntryBBED.IsExecutedByInitialThreadOnly = false;3053EntryBBED.IsReachedFromAlignedBarrierOnly = false;3054EntryBBED.EncounteredNonLocalSideEffect = true;3055ExitED.IsReachingAlignedBarrierOnly = false;3056}3057}30583059bool Changed = false;3060auto &FnED = BEDMap[nullptr];3061Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,3062FnED.IsReachedFromAlignedBarrierOnly &3063EntryBBED.IsReachedFromAlignedBarrierOnly);3064Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,3065FnED.IsReachingAlignedBarrierOnly &3066ExitED.IsReachingAlignedBarrierOnly);3067Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,3068EntryBBED.IsExecutedByInitialThreadOnly);3069return Changed;3070}30713072ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {30733074bool Changed = false;30753076// Helper to deal with an aligned barrier encountered 
during the forward3077// traversal. \p CB is the aligned barrier, \p ED is the execution domain when3078// it was encountered.3079auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {3080Changed |= AlignedBarriers.insert(&CB);3081// First, update the barrier ED kept in the separate CEDMap.3082auto &CallInED = CEDMap[{&CB, PRE}];3083Changed |= mergeInPredecessor(A, CallInED, ED);3084CallInED.IsReachingAlignedBarrierOnly = true;3085// Next adjust the ED we use for the traversal.3086ED.EncounteredNonLocalSideEffect = false;3087ED.IsReachedFromAlignedBarrierOnly = true;3088// Aligned barrier collection has to come last.3089ED.clearAssumeInstAndAlignedBarriers();3090ED.addAlignedBarrier(A, CB);3091auto &CallOutED = CEDMap[{&CB, POST}];3092Changed |= mergeInPredecessor(A, CallOutED, ED);3093};30943095auto *LivenessAA =3096A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);30973098Function *F = getAnchorScope();3099BasicBlock &EntryBB = F->getEntryBlock();3100bool IsKernel = omp::isOpenMPKernel(*F);31013102SmallVector<Instruction *> SyncInstWorklist;3103for (auto &RIt : *RPOT) {3104BasicBlock &BB = *RIt;31053106bool IsEntryBB = &BB == &EntryBB;3107// TODO: We use local reasoning since we don't have a divergence analysis3108// running as well. We could basically allow uniform branches here.3109bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;3110bool IsExplicitlyAligned = IsEntryBB && IsKernel;3111ExecutionDomainTy ED;3112// Propagate "incoming edges" into information about this block.3113if (IsEntryBB) {3114Changed |= handleCallees(A, ED);3115} else {3116// For live non-entry blocks we only propagate3117// information via live edges.3118if (LivenessAA && LivenessAA->isAssumedDead(&BB))3119continue;31203121for (auto *PredBB : predecessors(&BB)) {3122if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))3123continue;3124bool InitialEdgeOnly = isInitialThreadOnlyEdge(3125A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);3126mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);3127}3128}31293130// Now we traverse the block, accumulate effects in ED and attach3131// information to calls.3132for (Instruction &I : BB) {3133bool UsedAssumedInformation;3134if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,3135/* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL,3136/* CheckForDeadStore */ true))3137continue;31383139// Asummes and "assume-like" (dbg, lifetime, ...) 
are handled first, the3140// former is collected the latter is ignored.3141if (auto *II = dyn_cast<IntrinsicInst>(&I)) {3142if (auto *AI = dyn_cast_or_null<AssumeInst>(II)) {3143ED.addAssumeInst(A, *AI);3144continue;3145}3146// TODO: Should we also collect and delete lifetime markers?3147if (II->isAssumeLikeIntrinsic())3148continue;3149}31503151if (auto *FI = dyn_cast<FenceInst>(&I)) {3152if (!ED.EncounteredNonLocalSideEffect) {3153// An aligned fence without non-local side-effects is a no-op.3154if (ED.IsReachedFromAlignedBarrierOnly)3155continue;3156// A non-aligned fence without non-local side-effects is a no-op3157// if the ordering only publishes non-local side-effects (or less).3158switch (FI->getOrdering()) {3159case AtomicOrdering::NotAtomic:3160continue;3161case AtomicOrdering::Unordered:3162continue;3163case AtomicOrdering::Monotonic:3164continue;3165case AtomicOrdering::Acquire:3166break;3167case AtomicOrdering::Release:3168continue;3169case AtomicOrdering::AcquireRelease:3170break;3171case AtomicOrdering::SequentiallyConsistent:3172break;3173};3174}3175NonNoOpFences.insert(FI);3176}31773178auto *CB = dyn_cast<CallBase>(&I);3179bool IsNoSync = AA::isNoSyncInst(A, I, *this);3180bool IsAlignedBarrier =3181!IsNoSync && CB &&3182AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock);31833184AlignedBarrierLastInBlock &= IsNoSync;3185IsExplicitlyAligned &= IsNoSync;31863187// Next we check for calls. Aligned barriers are handled3188// explicitly, everything else is kept for the backward traversal and will3189// also affect our state.3190if (CB) {3191if (IsAlignedBarrier) {3192HandleAlignedBarrier(*CB, ED);3193AlignedBarrierLastInBlock = true;3194IsExplicitlyAligned = true;3195continue;3196}31973198// Check the pointer(s) of a memory intrinsic explicitly.3199if (isa<MemIntrinsic>(&I)) {3200if (!ED.EncounteredNonLocalSideEffect &&3201AA::isPotentiallyAffectedByBarrier(A, I, *this))3202ED.EncounteredNonLocalSideEffect = true;3203if (!IsNoSync) {3204ED.IsReachedFromAlignedBarrierOnly = false;3205SyncInstWorklist.push_back(&I);3206}3207continue;3208}32093210// Record how we entered the call, then accumulate the effect of the3211// call in ED for potential use by the callee.3212auto &CallInED = CEDMap[{CB, PRE}];3213Changed |= mergeInPredecessor(A, CallInED, ED);32143215// If we have a sync-definition we can check if it starts/ends in an3216// aligned barrier. 
If we are unsure we assume any sync breaks3217// alignment.3218Function *Callee = CB->getCalledFunction();3219if (!IsNoSync && Callee && !Callee->isDeclaration()) {3220const auto *EDAA = A.getAAFor<AAExecutionDomain>(3221*this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);3222if (EDAA && EDAA->getState().isValidState()) {3223const auto &CalleeED = EDAA->getFunctionExecutionDomain();3224ED.IsReachedFromAlignedBarrierOnly =3225CalleeED.IsReachedFromAlignedBarrierOnly;3226AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;3227if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)3228ED.EncounteredNonLocalSideEffect |=3229CalleeED.EncounteredNonLocalSideEffect;3230else3231ED.EncounteredNonLocalSideEffect =3232CalleeED.EncounteredNonLocalSideEffect;3233if (!CalleeED.IsReachingAlignedBarrierOnly) {3234Changed |=3235setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);3236SyncInstWorklist.push_back(&I);3237}3238if (CalleeED.IsReachedFromAlignedBarrierOnly)3239mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);3240auto &CallOutED = CEDMap[{CB, POST}];3241Changed |= mergeInPredecessor(A, CallOutED, ED);3242continue;3243}3244}3245if (!IsNoSync) {3246ED.IsReachedFromAlignedBarrierOnly = false;3247Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);3248SyncInstWorklist.push_back(&I);3249}3250AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;3251ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory();3252auto &CallOutED = CEDMap[{CB, POST}];3253Changed |= mergeInPredecessor(A, CallOutED, ED);3254}32553256if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())3257continue;32583259// If we have a callee we try to use fine-grained information to3260// determine local side-effects.3261if (CB) {3262const auto *MemAA = A.getAAFor<AAMemoryLocation>(3263*this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL);32643265auto AccessPred = [&](const Instruction *I, const Value *Ptr,3266AAMemoryLocation::AccessKind,3267AAMemoryLocation::MemoryLocationsKind) {3268return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I);3269};3270if (MemAA && MemAA->getState().isValidState() &&3271MemAA->checkForAllAccessesToMemoryKind(3272AccessPred, AAMemoryLocation::ALL_LOCATIONS))3273continue;3274}32753276auto &InfoCache = A.getInfoCache();3277if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))3278continue;32793280if (auto *LI = dyn_cast<LoadInst>(&I))3281if (LI->hasMetadata(LLVMContext::MD_invariant_load))3282continue;32833284if (!ED.EncounteredNonLocalSideEffect &&3285AA::isPotentiallyAffectedByBarrier(A, I, *this))3286ED.EncounteredNonLocalSideEffect = true;3287}32883289bool IsEndAndNotReachingAlignedBarriersOnly = false;3290if (!isa<UnreachableInst>(BB.getTerminator()) &&3291!BB.getTerminator()->getNumSuccessors()) {32923293Changed |= mergeInPredecessor(A, InterProceduralED, ED);32943295auto &FnED = BEDMap[nullptr];3296if (IsKernel && !IsExplicitlyAligned)3297FnED.IsReachingAlignedBarrierOnly = false;3298Changed |= mergeInPredecessor(A, FnED, ED);32993300if (!FnED.IsReachingAlignedBarrierOnly) {3301IsEndAndNotReachingAlignedBarriersOnly = true;3302SyncInstWorklist.push_back(BB.getTerminator());3303auto &BBED = BEDMap[&BB];3304Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);3305}3306}33073308ExecutionDomainTy &StoredED = BEDMap[&BB];3309ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &3310!IsEndAndNotReachingAlignedBarriersOnly;33113312// Check if we computed anything different as part 
of the forward3313// traversal. We do not take assumptions and aligned barriers into account3314// as they do not influence the state we iterate. Backward traversal values3315// are handled later on.3316if (ED.IsExecutedByInitialThreadOnly !=3317StoredED.IsExecutedByInitialThreadOnly ||3318ED.IsReachedFromAlignedBarrierOnly !=3319StoredED.IsReachedFromAlignedBarrierOnly ||3320ED.EncounteredNonLocalSideEffect !=3321StoredED.EncounteredNonLocalSideEffect)3322Changed = true;33233324// Update the state with the new value.3325StoredED = std::move(ED);3326}33273328// Propagate (non-aligned) sync instruction effects backwards until the3329// entry is hit or an aligned barrier.3330SmallSetVector<BasicBlock *, 16> Visited;3331while (!SyncInstWorklist.empty()) {3332Instruction *SyncInst = SyncInstWorklist.pop_back_val();3333Instruction *CurInst = SyncInst;3334bool HitAlignedBarrierOrKnownEnd = false;3335while ((CurInst = CurInst->getPrevNode())) {3336auto *CB = dyn_cast<CallBase>(CurInst);3337if (!CB)3338continue;3339auto &CallOutED = CEDMap[{CB, POST}];3340Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);3341auto &CallInED = CEDMap[{CB, PRE}];3342HitAlignedBarrierOrKnownEnd =3343AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;3344if (HitAlignedBarrierOrKnownEnd)3345break;3346Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);3347}3348if (HitAlignedBarrierOrKnownEnd)3349continue;3350BasicBlock *SyncBB = SyncInst->getParent();3351for (auto *PredBB : predecessors(SyncBB)) {3352if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))3353continue;3354if (!Visited.insert(PredBB))3355continue;3356auto &PredED = BEDMap[PredBB];3357if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {3358Changed = true;3359SyncInstWorklist.push_back(PredBB->getTerminator());3360}3361}3362if (SyncBB != &EntryBB)3363continue;3364Changed |=3365setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);3366}33673368return Changed ? 
ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;3369}33703371/// Try to replace memory allocation calls called by a single thread with a3372/// static buffer of shared memory.3373struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {3374using Base = StateWrapper<BooleanState, AbstractAttribute>;3375AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}33763377/// Create an abstract attribute view for the position \p IRP.3378static AAHeapToShared &createForPosition(const IRPosition &IRP,3379Attributor &A);33803381/// Returns true if HeapToShared conversion is assumed to be possible.3382virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;33833384/// Returns true if HeapToShared conversion is assumed and the CB is a3385/// callsite to a free operation to be removed.3386virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;33873388/// See AbstractAttribute::getName().3389const std::string getName() const override { return "AAHeapToShared"; }33903391/// See AbstractAttribute::getIdAddr().3392const char *getIdAddr() const override { return &ID; }33933394/// This function should return true if the type of the \p AA is3395/// AAHeapToShared.3396static bool classof(const AbstractAttribute *AA) {3397return (AA->getIdAddr() == &ID);3398}33993400/// Unique ID (due to the unique address)3401static const char ID;3402};34033404struct AAHeapToSharedFunction : public AAHeapToShared {3405AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)3406: AAHeapToShared(IRP, A) {}34073408const std::string getAsStr(Attributor *) const override {3409return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +3410" malloc calls eligible.";3411}34123413/// See AbstractAttribute::trackStatistics().3414void trackStatistics() const override {}34153416/// This functions finds free calls that will be removed by the3417/// HeapToShared transformation.3418void findPotentialRemovedFreeCalls(Attributor &A) {3419auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());3420auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];34213422PotentialRemovedFreeCalls.clear();3423// Update free call users of found malloc calls.3424for (CallBase *CB : MallocCalls) {3425SmallVector<CallBase *, 4> FreeCalls;3426for (auto *U : CB->users()) {3427CallBase *C = dyn_cast<CallBase>(U);3428if (C && C->getCalledFunction() == FreeRFI.Declaration)3429FreeCalls.push_back(C);3430}34313432if (FreeCalls.size() != 1)3433continue;34343435PotentialRemovedFreeCalls.insert(FreeCalls.front());3436}3437}34383439void initialize(Attributor &A) override {3440if (DisableOpenMPOptDeglobalization) {3441indicatePessimisticFixpoint();3442return;3443}34443445auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());3446auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];3447if (!RFI.Declaration)3448return;34493450Attributor::SimplifictionCallbackTy SCB =3451[](const IRPosition &, const AbstractAttribute *,3452bool &) -> std::optional<Value *> { return nullptr; };34533454Function *F = getAnchorScope();3455for (User *U : RFI.Declaration->users())3456if (CallBase *CB = dyn_cast<CallBase>(U)) {3457if (CB->getFunction() != F)3458continue;3459MallocCalls.insert(CB);3460A.registerSimplificationCallback(IRPosition::callsite_returned(*CB),3461SCB);3462}34633464findPotentialRemovedFreeCalls(A);3465}34663467bool isAssumedHeapToShared(CallBase &CB) const override {3468return isValidState() && MallocCalls.count(&CB);3469}34703471bool 
isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {3472return isValidState() && PotentialRemovedFreeCalls.count(&CB);3473}34743475ChangeStatus manifest(Attributor &A) override {3476if (MallocCalls.empty())3477return ChangeStatus::UNCHANGED;34783479auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());3480auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];34813482Function *F = getAnchorScope();3483auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,3484DepClassTy::OPTIONAL);34853486ChangeStatus Changed = ChangeStatus::UNCHANGED;3487for (CallBase *CB : MallocCalls) {3488// Skip replacing this if HeapToStack has already claimed it.3489if (HS && HS->isAssumedHeapToStack(*CB))3490continue;34913492// Find the unique free call to remove it.3493SmallVector<CallBase *, 4> FreeCalls;3494for (auto *U : CB->users()) {3495CallBase *C = dyn_cast<CallBase>(U);3496if (C && C->getCalledFunction() == FreeCall.Declaration)3497FreeCalls.push_back(C);3498}3499if (FreeCalls.size() != 1)3500continue;35013502auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));35033504if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {3505LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB3506<< " with shared memory."3507<< " Shared memory usage is limited to "3508<< SharedMemoryLimit << " bytes\n");3509continue;3510}35113512LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB3513<< " with " << AllocSize->getZExtValue()3514<< " bytes of shared memory\n");35153516// Create a new shared memory buffer of the same size as the allocation3517// and replace all the uses of the original allocation with it.3518Module *M = CB->getModule();3519Type *Int8Ty = Type::getInt8Ty(M->getContext());3520Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());3521auto *SharedMem = new GlobalVariable(3522*M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,3523PoisonValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr,3524GlobalValue::NotThreadLocal,3525static_cast<unsigned>(AddressSpace::Shared));3526auto *NewBuffer =3527ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());35283529auto Remark = [&](OptimizationRemark OR) {3530return OR << "Replaced globalized variable with "3531<< ore::NV("SharedMemory", AllocSize->getZExtValue())3532<< (AllocSize->isOne() ? 
" byte " : " bytes ")3533<< "of shared memory.";3534};3535A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);35363537MaybeAlign Alignment = CB->getRetAlign();3538assert(Alignment &&3539"HeapToShared on allocation without alignment attribute");3540SharedMem->setAlignment(*Alignment);35413542A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer);3543A.deleteAfterManifest(*CB);3544A.deleteAfterManifest(*FreeCalls.front());35453546SharedMemoryUsed += AllocSize->getZExtValue();3547NumBytesMovedToSharedMemory = SharedMemoryUsed;3548Changed = ChangeStatus::CHANGED;3549}35503551return Changed;3552}35533554ChangeStatus updateImpl(Attributor &A) override {3555if (MallocCalls.empty())3556return indicatePessimisticFixpoint();3557auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());3558auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];3559if (!RFI.Declaration)3560return ChangeStatus::UNCHANGED;35613562Function *F = getAnchorScope();35633564auto NumMallocCalls = MallocCalls.size();35653566// Only consider malloc calls executed by a single thread with a constant.3567for (User *U : RFI.Declaration->users()) {3568if (CallBase *CB = dyn_cast<CallBase>(U)) {3569if (CB->getCaller() != F)3570continue;3571if (!MallocCalls.count(CB))3572continue;3573if (!isa<ConstantInt>(CB->getArgOperand(0))) {3574MallocCalls.remove(CB);3575continue;3576}3577const auto *ED = A.getAAFor<AAExecutionDomain>(3578*this, IRPosition::function(*F), DepClassTy::REQUIRED);3579if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))3580MallocCalls.remove(CB);3581}3582}35833584findPotentialRemovedFreeCalls(A);35853586if (NumMallocCalls != MallocCalls.size())3587return ChangeStatus::CHANGED;35883589return ChangeStatus::UNCHANGED;3590}35913592/// Collection of all malloc calls in a function.3593SmallSetVector<CallBase *, 4> MallocCalls;3594/// Collection of potentially removed free calls in a function.3595SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;3596/// The total amount of shared memory that has been used for HeapToShared.3597unsigned SharedMemoryUsed = 0;3598};35993600struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {3601using Base = StateWrapper<KernelInfoState, AbstractAttribute>;3602AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}36033604/// The callee value is tracked beyond a simple stripPointerCasts, so we allow3605/// unknown callees.3606static bool requiresCalleeForCallBase() { return false; }36073608/// Statistics are tracked as part of manifest for now.3609void trackStatistics() const override {}36103611/// See AbstractAttribute::getAsStr()3612const std::string getAsStr(Attributor *) const override {3613if (!isValidState())3614return "<invalid>";3615return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"3616: "generic") +3617std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"3618: "") +3619std::string(" #PRs: ") +3620(ReachedKnownParallelRegions.isValidState()3621? std::to_string(ReachedKnownParallelRegions.size())3622: "<invalid>") +3623", #Unknown PRs: " +3624(ReachedUnknownParallelRegions.isValidState()3625? std::to_string(ReachedUnknownParallelRegions.size())3626: "<invalid>") +3627", #Reaching Kernels: " +3628(ReachingKernelEntries.isValidState()3629? std::to_string(ReachingKernelEntries.size())3630: "<invalid>") +3631", #ParLevels: " +3632(ParallelLevels.isValidState()3633? std::to_string(ParallelLevels.size())3634: "<invalid>") +3635", NestedPar: " + (NestedParallelism ? 
"yes" : "no");3636}36373638/// Create an abstract attribute biew for the position \p IRP.3639static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);36403641/// See AbstractAttribute::getName()3642const std::string getName() const override { return "AAKernelInfo"; }36433644/// See AbstractAttribute::getIdAddr()3645const char *getIdAddr() const override { return &ID; }36463647/// This function should return true if the type of the \p AA is AAKernelInfo3648static bool classof(const AbstractAttribute *AA) {3649return (AA->getIdAddr() == &ID);3650}36513652static const char ID;3653};36543655/// The function kernel info abstract attribute, basically, what can we say3656/// about a function with regards to the KernelInfoState.3657struct AAKernelInfoFunction : AAKernelInfo {3658AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)3659: AAKernelInfo(IRP, A) {}36603661SmallPtrSet<Instruction *, 4> GuardedInstructions;36623663SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {3664return GuardedInstructions;3665}36663667void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {3668Constant *NewKernelEnvC = ConstantFoldInsertValueInstruction(3669KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});3670assert(NewKernelEnvC && "Failed to create new kernel environment");3671KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);3672}36733674#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \3675void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \3676ConstantStruct *ConfigC = \3677KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \3678Constant *NewConfigC = ConstantFoldInsertValueInstruction( \3679ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \3680assert(NewConfigC && "Failed to create new configuration environment"); \3681setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \3682}36833684KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(UseGenericStateMachine)3685KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MayUseNestedParallelism)3686KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(ExecMode)3687KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MinThreads)3688KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MaxThreads)3689KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MinTeams)3690KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MaxTeams)36913692#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER36933694/// See AbstractAttribute::initialize(...).3695void initialize(Attributor &A) override {3696// This is a high-level transform that might change the constant arguments3697// of the init and dinit calls. 
We need to tell the Attributor about this3698// to avoid other parts using the current constant value for simpliication.3699auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());37003701Function *Fn = getAnchorScope();37023703OMPInformationCache::RuntimeFunctionInfo &InitRFI =3704OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];3705OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =3706OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];37073708// For kernels we perform more initialization work, first we find the init3709// and deinit calls.3710auto StoreCallBase = [](Use &U,3711OMPInformationCache::RuntimeFunctionInfo &RFI,3712CallBase *&Storage) {3713CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);3714assert(CB &&3715"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");3716assert(!Storage &&3717"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");3718Storage = CB;3719return false;3720};3721InitRFI.foreachUse(3722[&](Use &U, Function &) {3723StoreCallBase(U, InitRFI, KernelInitCB);3724return false;3725},3726Fn);3727DeinitRFI.foreachUse(3728[&](Use &U, Function &) {3729StoreCallBase(U, DeinitRFI, KernelDeinitCB);3730return false;3731},3732Fn);37333734// Ignore kernels without initializers such as global constructors.3735if (!KernelInitCB || !KernelDeinitCB)3736return;37373738// Add itself to the reaching kernel and set IsKernelEntry.3739ReachingKernelEntries.insert(Fn);3740IsKernelEntry = true;37413742KernelEnvC =3743KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);3744GlobalVariable *KernelEnvGV =3745KernelInfo::getKernelEnvironementGVFromKernelInitCB(KernelInitCB);37463747Attributor::GlobalVariableSimplifictionCallbackTy3748KernelConfigurationSimplifyCB =3749[&](const GlobalVariable &GV, const AbstractAttribute *AA,3750bool &UsedAssumedInformation) -> std::optional<Constant *> {3751if (!isAtFixpoint()) {3752if (!AA)3753return nullptr;3754UsedAssumedInformation = true;3755A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);3756}3757return KernelEnvC;3758};37593760A.registerGlobalVariableSimplificationCallback(3761*KernelEnvGV, KernelConfigurationSimplifyCB);37623763// Check if we know we are in SPMD-mode already.3764ConstantInt *ExecModeC =3765KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);3766ConstantInt *AssumedExecModeC = ConstantInt::get(3767ExecModeC->getIntegerType(),3768ExecModeC->getSExtValue() | OMP_TGT_EXEC_MODE_GENERIC_SPMD);3769if (ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)3770SPMDCompatibilityTracker.indicateOptimisticFixpoint();3771else if (DisableOpenMPOptSPMDization)3772// This is a generic region but SPMDization is disabled so stop3773// tracking.3774SPMDCompatibilityTracker.indicatePessimisticFixpoint();3775else3776setExecModeOfKernelEnvironment(AssumedExecModeC);37773778const Triple T(Fn->getParent()->getTargetTriple());3779auto *Int32Ty = Type::getInt32Ty(Fn->getContext());3780auto [MinThreads, MaxThreads] =3781OpenMPIRBuilder::readThreadBoundsForKernel(T, *Fn);3782if (MinThreads)3783setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));3784if (MaxThreads)3785setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxThreads));3786auto [MinTeams, MaxTeams] =3787OpenMPIRBuilder::readTeamBoundsForKernel(T, *Fn);3788if (MinTeams)3789setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));3790if (MaxTeams)3791setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));37923793ConstantInt *MayUseNestedParallelismC 
=3794KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);3795ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(3796MayUseNestedParallelismC->getIntegerType(), NestedParallelism);3797setMayUseNestedParallelismOfKernelEnvironment(3798AssumedMayUseNestedParallelismC);37993800if (!DisableOpenMPOptStateMachineRewrite) {3801ConstantInt *UseGenericStateMachineC =3802KernelInfo::getUseGenericStateMachineFromKernelEnvironment(3803KernelEnvC);3804ConstantInt *AssumedUseGenericStateMachineC =3805ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);3806setUseGenericStateMachineOfKernelEnvironment(3807AssumedUseGenericStateMachineC);3808}38093810// Register virtual uses of functions we might need to preserve.3811auto RegisterVirtualUse = [&](RuntimeFunction RFKind,3812Attributor::VirtualUseCallbackTy &CB) {3813if (!OMPInfoCache.RFIs[RFKind].Declaration)3814return;3815A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);3816};38173818// Add a dependence to ensure updates if the state changes.3819auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,3820const AbstractAttribute *QueryingAA) {3821if (QueryingAA) {3822A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);3823}3824return true;3825};38263827Attributor::VirtualUseCallbackTy CustomStateMachineUseCB =3828[&](Attributor &A, const AbstractAttribute *QueryingAA) {3829// Whenever we create a custom state machine we will insert calls to3830// __kmpc_get_hardware_num_threads_in_block,3831// __kmpc_get_warp_size,3832// __kmpc_barrier_simple_generic,3833// __kmpc_kernel_parallel, and3834// __kmpc_kernel_end_parallel.3835// Not needed if we are on track for SPMDzation.3836if (SPMDCompatibilityTracker.isValidState())3837return AddDependence(A, this, QueryingAA);3838// Not needed if we can't rewrite due to an invalid state.3839if (!ReachedKnownParallelRegions.isValidState())3840return AddDependence(A, this, QueryingAA);3841return false;3842};38433844// Not needed if we are pre-runtime merge.3845if (!KernelInitCB->getCalledFunction()->isDeclaration()) {3846RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,3847CustomStateMachineUseCB);3848RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);3849RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,3850CustomStateMachineUseCB);3851RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,3852CustomStateMachineUseCB);3853RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,3854CustomStateMachineUseCB);3855}38563857// If we do not perform SPMDzation we do not need the virtual uses below.3858if (SPMDCompatibilityTracker.isAtFixpoint())3859return;38603861Attributor::VirtualUseCallbackTy HWThreadIdUseCB =3862[&](Attributor &A, const AbstractAttribute *QueryingAA) {3863// Whenever we perform SPMDzation we will insert3864// __kmpc_get_hardware_thread_id_in_block calls.3865if (!SPMDCompatibilityTracker.isValidState())3866return AddDependence(A, this, QueryingAA);3867return false;3868};3869RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,3870HWThreadIdUseCB);38713872Attributor::VirtualUseCallbackTy SPMDBarrierUseCB =3873[&](Attributor &A, const AbstractAttribute *QueryingAA) {3874// Whenever we perform SPMDzation with guarding we will insert3875// __kmpc_simple_barrier_spmd calls. 
If SPMDzation failed, there is3876// nothing to guard, or there are no parallel regions, we don't need3877// the calls.3878if (!SPMDCompatibilityTracker.isValidState())3879return AddDependence(A, this, QueryingAA);3880if (SPMDCompatibilityTracker.empty())3881return AddDependence(A, this, QueryingAA);3882if (!mayContainParallelRegion())3883return AddDependence(A, this, QueryingAA);3884return false;3885};3886RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);3887}38883889/// Sanitize the string \p S such that it is a suitable global symbol name.3890static std::string sanitizeForGlobalName(std::string S) {3891std::replace_if(3892S.begin(), S.end(),3893[](const char C) {3894return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||3895(C >= '0' && C <= '9') || C == '_');3896},3897'.');3898return S;3899}39003901/// Modify the IR based on the KernelInfoState as the fixpoint iteration is3902/// finished now.3903ChangeStatus manifest(Attributor &A) override {3904// If we are not looking at a kernel with __kmpc_target_init and3905// __kmpc_target_deinit call we cannot actually manifest the information.3906if (!KernelInitCB || !KernelDeinitCB)3907return ChangeStatus::UNCHANGED;39083909ChangeStatus Changed = ChangeStatus::UNCHANGED;39103911bool HasBuiltStateMachine = true;3912if (!changeToSPMDMode(A, Changed)) {3913if (!KernelInitCB->getCalledFunction()->isDeclaration())3914HasBuiltStateMachine = buildCustomStateMachine(A, Changed);3915else3916HasBuiltStateMachine = false;3917}39183919// We need to reset KernelEnvC if specific rewriting is not done.3920ConstantStruct *ExistingKernelEnvC =3921KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);3922ConstantInt *OldUseGenericStateMachineVal =3923KernelInfo::getUseGenericStateMachineFromKernelEnvironment(3924ExistingKernelEnvC);3925if (!HasBuiltStateMachine)3926setUseGenericStateMachineOfKernelEnvironment(3927OldUseGenericStateMachineVal);39283929// At last, update the KernelEnvc3930GlobalVariable *KernelEnvGV =3931KernelInfo::getKernelEnvironementGVFromKernelInitCB(KernelInitCB);3932if (KernelEnvGV->getInitializer() != KernelEnvC) {3933KernelEnvGV->setInitializer(KernelEnvC);3934Changed = ChangeStatus::CHANGED;3935}39363937return Changed;3938}39393940void insertInstructionGuardsHelper(Attributor &A) {3941auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());39423943auto CreateGuardedRegion = [&](Instruction *RegionStartI,3944Instruction *RegionEndI) {3945LoopInfo *LI = nullptr;3946DominatorTree *DT = nullptr;3947MemorySSAUpdater *MSU = nullptr;3948using InsertPointTy = OpenMPIRBuilder::InsertPointTy;39493950BasicBlock *ParentBB = RegionStartI->getParent();3951Function *Fn = ParentBB->getParent();3952Module &M = *Fn->getParent();39533954// Create all the blocks and logic.3955// ParentBB:3956// goto RegionCheckTidBB3957// RegionCheckTidBB:3958// Tid = __kmpc_hardware_thread_id()3959// if (Tid != 0)3960// goto RegionBarrierBB3961// RegionStartBB:3962// <execute instructions guarded>3963// goto RegionEndBB3964// RegionEndBB:3965// <store escaping values to shared mem>3966// goto RegionBarrierBB3967// RegionBarrierBB:3968// __kmpc_simple_barrier_spmd()3969// // second barrier is omitted if lacking escaping values.3970// <load escaping values from shared mem>3971// __kmpc_simple_barrier_spmd()3972// goto RegionExitBB3973// RegionExitBB:3974// <execute rest of instructions>39753976BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),3977DT, LI, MSU, "region.guarded.end");3978BasicBlock 
*RegionBarrierBB =3979SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,3980MSU, "region.barrier");3981BasicBlock *RegionExitBB =3982SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),3983DT, LI, MSU, "region.exit");3984BasicBlock *RegionStartBB =3985SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");39863987assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&3988"Expected a different CFG");39893990BasicBlock *RegionCheckTidBB = SplitBlock(3991ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");39923993// Register basic blocks with the Attributor.3994A.registerManifestAddedBasicBlock(*RegionEndBB);3995A.registerManifestAddedBasicBlock(*RegionBarrierBB);3996A.registerManifestAddedBasicBlock(*RegionExitBB);3997A.registerManifestAddedBasicBlock(*RegionStartBB);3998A.registerManifestAddedBasicBlock(*RegionCheckTidBB);39994000bool HasBroadcastValues = false;4001// Find escaping outputs from the guarded region to outside users and4002// broadcast their values to them.4003for (Instruction &I : *RegionStartBB) {4004SmallVector<Use *, 4> OutsideUses;4005for (Use &U : I.uses()) {4006Instruction &UsrI = *cast<Instruction>(U.getUser());4007if (UsrI.getParent() != RegionStartBB)4008OutsideUses.push_back(&U);4009}40104011if (OutsideUses.empty())4012continue;40134014HasBroadcastValues = true;40154016// Emit a global variable in shared memory to store the broadcasted4017// value.4018auto *SharedMem = new GlobalVariable(4019M, I.getType(), /* IsConstant */ false,4020GlobalValue::InternalLinkage, UndefValue::get(I.getType()),4021sanitizeForGlobalName(4022(I.getName() + ".guarded.output.alloc").str()),4023nullptr, GlobalValue::NotThreadLocal,4024static_cast<unsigned>(AddressSpace::Shared));40254026// Emit a store instruction to update the value.4027new StoreInst(&I, SharedMem,4028RegionEndBB->getTerminator()->getIterator());40294030LoadInst *LoadI = new LoadInst(4031I.getType(), SharedMem, I.getName() + ".guarded.output.load",4032RegionBarrierBB->getTerminator()->getIterator());40334034// Emit a load instruction and replace uses of the output value.4035for (Use *U : OutsideUses)4036A.changeUseAfterManifest(*U, *LoadI);4037}40384039auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());40404041// Go to tid check BB in ParentBB.4042const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();4043ParentBB->getTerminator()->eraseFromParent();4044OpenMPIRBuilder::LocationDescription Loc(4045InsertPointTy(ParentBB, ParentBB->end()), DL);4046OMPInfoCache.OMPBuilder.updateToLocation(Loc);4047uint32_t SrcLocStrSize;4048auto *SrcLocStr =4049OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);4050Value *Ident =4051OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);4052BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);40534054// Add check for Tid in RegionCheckTidBB4055RegionCheckTidBB->getTerminator()->eraseFromParent();4056OpenMPIRBuilder::LocationDescription LocRegionCheckTid(4057InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);4058OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);4059FunctionCallee HardwareTidFn =4060OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4061M, OMPRTL___kmpc_get_hardware_thread_id_in_block);4062CallInst *Tid =4063OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});4064Tid->setDebugLoc(DL);4065OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);4066Value *TidCheck = 
OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);4067OMPInfoCache.OMPBuilder.Builder4068.CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)4069->setDebugLoc(DL);40704071// First barrier for synchronization, ensures main thread has updated4072// values.4073FunctionCallee BarrierFn =4074OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4075M, OMPRTL___kmpc_barrier_simple_spmd);4076OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(4077RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));4078CallInst *Barrier =4079OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});4080Barrier->setDebugLoc(DL);4081OMPInfoCache.setCallingConvention(BarrierFn, Barrier);40824083// Second barrier ensures workers have read broadcast values.4084if (HasBroadcastValues) {4085CallInst *Barrier =4086CallInst::Create(BarrierFn, {Ident, Tid}, "",4087RegionBarrierBB->getTerminator()->getIterator());4088Barrier->setDebugLoc(DL);4089OMPInfoCache.setCallingConvention(BarrierFn, Barrier);4090}4091};40924093auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];4094SmallPtrSet<BasicBlock *, 8> Visited;4095for (Instruction *GuardedI : SPMDCompatibilityTracker) {4096BasicBlock *BB = GuardedI->getParent();4097if (!Visited.insert(BB).second)4098continue;40994100SmallVector<std::pair<Instruction *, Instruction *>> Reorders;4101Instruction *LastEffect = nullptr;4102BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();4103while (++IP != IPEnd) {4104if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())4105continue;4106Instruction *I = &*IP;4107if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))4108continue;4109if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {4110LastEffect = nullptr;4111continue;4112}4113if (LastEffect)4114Reorders.push_back({I, LastEffect});4115LastEffect = &*IP;4116}4117for (auto &Reorder : Reorders)4118Reorder.first->moveBefore(Reorder.second);4119}41204121SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;41224123for (Instruction *GuardedI : SPMDCompatibilityTracker) {4124BasicBlock *BB = GuardedI->getParent();4125auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(4126IRPosition::function(*GuardedI->getFunction()), nullptr,4127DepClassTy::NONE);4128assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");4129auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);4130// Continue if instruction is already guarded.4131if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))4132continue;41334134Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;4135for (Instruction &I : *BB) {4136// If instruction I needs to be guarded update the guarded region4137// bounds.4138if (SPMDCompatibilityTracker.contains(&I)) {4139CalleeAAFunction.getGuardedInstructions().insert(&I);4140if (GuardedRegionStart)4141GuardedRegionEnd = &I;4142else4143GuardedRegionStart = GuardedRegionEnd = &I;41444145continue;4146}41474148// Instruction I does not need guarding, store4149// any region found and reset bounds.4150if (GuardedRegionStart) {4151GuardedRegions.push_back(4152std::make_pair(GuardedRegionStart, GuardedRegionEnd));4153GuardedRegionStart = nullptr;4154GuardedRegionEnd = nullptr;4155}4156}4157}41584159for (auto &GR : GuardedRegions)4160CreateGuardedRegion(GR.first, GR.second);4161}41624163void forceSingleThreadPerWorkgroupHelper(Attributor &A) {4164// Only allow 1 thread per workgroup to continue executing the user code.4165//4166// InitCB = __kmpc_target_init(...)4167// ThreadIdInBlock = 
__kmpc_get_hardware_thread_id_in_block();
    //     if (ThreadIdInBlock != 0) return;
    // UserCode:
    //     // user code
    //
    auto &Ctx = getAnchorValue().getContext();
    Function *Kernel = getAssociatedFunction();
    assert(Kernel && "Expected an associated function!");

    // Create block for user code to branch to from initial block.
    BasicBlock *InitBB = KernelInitCB->getParent();
    BasicBlock *UserCodeBB = InitBB->splitBasicBlock(
        KernelInitCB->getNextNode(), "main.thread.user_code");
    BasicBlock *ReturnBB =
        BasicBlock::Create(Ctx, "exit.threads", Kernel, UserCodeBB);

    // Register blocks with attributor:
    A.registerManifestAddedBasicBlock(*InitBB);
    A.registerManifestAddedBasicBlock(*UserCodeBB);
    A.registerManifestAddedBasicBlock(*ReturnBB);

    // Debug location:
    const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
    ReturnInst::Create(Ctx, ReturnBB)->setDebugLoc(DLoc);
    InitBB->getTerminator()->eraseFromParent();

    // Prepare call to OMPRTL___kmpc_get_hardware_thread_id_in_block.
    Module &M = *Kernel->getParent();
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    FunctionCallee ThreadIdInBlockFn =
        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
            M, OMPRTL___kmpc_get_hardware_thread_id_in_block);

    // Get thread ID in block.
    CallInst *ThreadIdInBlock =
        CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);
    OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
    ThreadIdInBlock->setDebugLoc(DLoc);

    // Eliminate all threads in the block with ID not equal to 0:
    Instruction *IsMainThread =
        ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,
                         ConstantInt::get(ThreadIdInBlock->getType(), 0),
                         "thread.is_main", InitBB);
    IsMainThread->setDebugLoc(DLoc);
    BranchInst::Create(ReturnBB, UserCodeBB, IsMainThread, InitBB);
  }

  bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    // We cannot change to SPMD mode if the runtime functions aren't available.
    if (!OMPInfoCache.runtimeFnsAvailable(
            {OMPRTL___kmpc_get_hardware_thread_id_in_block,
             OMPRTL___kmpc_barrier_simple_spmd}))
      return false;

    if (!SPMDCompatibilityTracker.isAssumed()) {
      for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
        if (!NonCompatibleI)
          continue;

        // Skip diagnostics on calls to known OpenMP runtime functions for now.
        if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
          if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
            continue;

        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          ORA << "Value has potential side effects preventing SPMD-mode "
                 "execution";
          if (isa<CallBase>(NonCompatibleI)) {
            ORA << ". 
Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "4239"the called function to override";4240}4241return ORA << ".";4242};4243A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",4244Remark);42454246LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "4247<< *NonCompatibleI << "\n");4248}42494250return false;4251}42524253// Get the actual kernel, could be the caller of the anchor scope if we have4254// a debug wrapper.4255Function *Kernel = getAnchorScope();4256if (Kernel->hasLocalLinkage()) {4257assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper.");4258auto *CB = cast<CallBase>(Kernel->user_back());4259Kernel = CB->getCaller();4260}4261assert(omp::isOpenMPKernel(*Kernel) && "Expected kernel function!");42624263// Check if the kernel is already in SPMD mode, if so, return success.4264ConstantStruct *ExistingKernelEnvC =4265KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);4266auto *ExecModeC =4267KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);4268const int8_t ExecModeVal = ExecModeC->getSExtValue();4269if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)4270return true;42714272// We will now unconditionally modify the IR, indicate a change.4273Changed = ChangeStatus::CHANGED;42744275// Do not use instruction guards when no parallel is present inside4276// the target region.4277if (mayContainParallelRegion())4278insertInstructionGuardsHelper(A);4279else4280forceSingleThreadPerWorkgroupHelper(A);42814282// Adjust the global exec mode flag that tells the runtime what mode this4283// kernel is executed in.4284assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&4285"Initially non-SPMD kernel has SPMD exec mode!");4286setExecModeOfKernelEnvironment(4287ConstantInt::get(ExecModeC->getIntegerType(),4288ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));42894290++NumOpenMPTargetRegionKernelsSPMD;42914292auto Remark = [&](OptimizationRemark OR) {4293return OR << "Transformed generic-mode kernel to SPMD-mode.";4294};4295A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);4296return true;4297};42984299bool buildCustomStateMachine(Attributor &A, ChangeStatus &Changed) {4300// If we have disabled state machine rewrites, don't make a custom one4301if (DisableOpenMPOptStateMachineRewrite)4302return false;43034304// Don't rewrite the state machine if we are not in a valid state.4305if (!ReachedKnownParallelRegions.isValidState())4306return false;43074308auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());4309if (!OMPInfoCache.runtimeFnsAvailable(4310{OMPRTL___kmpc_get_hardware_num_threads_in_block,4311OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,4312OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))4313return false;43144315ConstantStruct *ExistingKernelEnvC =4316KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);43174318// Check if the current configuration is non-SPMD and generic state machine.4319// If we already have SPMD mode or a custom state machine we do not need to4320// go any further. 
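    // (Note: if the UseGenericStateMachine flag is already zero, a custom
    //  state machine was presumably installed earlier or none is required, so
    //  there is nothing left to rewrite; SPMD kernels never use the generic
    //  state machine.)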
If it is anything but a constant something is weird and4321// we give up.4322ConstantInt *UseStateMachineC =4323KernelInfo::getUseGenericStateMachineFromKernelEnvironment(4324ExistingKernelEnvC);4325ConstantInt *ModeC =4326KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);43274328// If we are stuck with generic mode, try to create a custom device (=GPU)4329// state machine which is specialized for the parallel regions that are4330// reachable by the kernel.4331if (UseStateMachineC->isZero() ||4332(ModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))4333return false;43344335Changed = ChangeStatus::CHANGED;43364337// If not SPMD mode, indicate we use a custom state machine now.4338setUseGenericStateMachineOfKernelEnvironment(4339ConstantInt::get(UseStateMachineC->getIntegerType(), false));43404341// If we don't actually need a state machine we are done here. This can4342// happen if there simply are no parallel regions. In the resulting kernel4343// all worker threads will simply exit right away, leaving the main thread4344// to do the work alone.4345if (!mayContainParallelRegion()) {4346++NumOpenMPTargetRegionKernelsWithoutStateMachine;43474348auto Remark = [&](OptimizationRemark OR) {4349return OR << "Removing unused state machine from generic-mode kernel.";4350};4351A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);43524353return true;4354}43554356// Keep track in the statistics of our new shiny custom state machine.4357if (ReachedUnknownParallelRegions.empty()) {4358++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;43594360auto Remark = [&](OptimizationRemark OR) {4361return OR << "Rewriting generic-mode kernel with a customized state "4362"machine.";4363};4364A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);4365} else {4366++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;43674368auto Remark = [&](OptimizationRemarkAnalysis OR) {4369return OR << "Generic-mode kernel is executed with a customized state "4370"machine that requires a fallback.";4371};4372A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);43734374// Tell the user why we ended up with a fallback.4375for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {4376if (!UnknownParallelRegionCB)4377continue;4378auto Remark = [&](OptimizationRemarkAnalysis ORA) {4379return ORA << "Call may contain unknown parallel regions. 
Use "4380<< "`[[omp::assume(\"omp_no_parallelism\")]]` to "4381"override.";4382};4383A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,4384"OMP133", Remark);4385}4386}43874388// Create all the blocks:4389//4390// InitCB = __kmpc_target_init(...)4391// BlockHwSize =4392// __kmpc_get_hardware_num_threads_in_block();4393// WarpSize = __kmpc_get_warp_size();4394// BlockSize = BlockHwSize - WarpSize;4395// IsWorkerCheckBB: bool IsWorker = InitCB != -1;4396// if (IsWorker) {4397// if (InitCB >= BlockSize) return;4398// SMBeginBB: __kmpc_barrier_simple_generic(...);4399// void *WorkFn;4400// bool Active = __kmpc_kernel_parallel(&WorkFn);4401// if (!WorkFn) return;4402// SMIsActiveCheckBB: if (Active) {4403// SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)4404// ParFn0(...);4405// SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)4406// ParFn1(...);4407// ...4408// SMIfCascadeCurrentBB: else4409// ((WorkFnTy*)WorkFn)(...);4410// SMEndParallelBB: __kmpc_kernel_end_parallel(...);4411// }4412// SMDoneBB: __kmpc_barrier_simple_generic(...);4413// goto SMBeginBB;4414// }4415// UserCodeEntryBB: // user code4416// __kmpc_target_deinit(...)4417//4418auto &Ctx = getAnchorValue().getContext();4419Function *Kernel = getAssociatedFunction();4420assert(Kernel && "Expected an associated function!");44214422BasicBlock *InitBB = KernelInitCB->getParent();4423BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(4424KernelInitCB->getNextNode(), "thread.user_code.check");4425BasicBlock *IsWorkerCheckBB =4426BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB);4427BasicBlock *StateMachineBeginBB = BasicBlock::Create(4428Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);4429BasicBlock *StateMachineFinishedBB = BasicBlock::Create(4430Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);4431BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(4432Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);4433BasicBlock *StateMachineIfCascadeCurrentBB =4434BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",4435Kernel, UserCodeEntryBB);4436BasicBlock *StateMachineEndParallelBB =4437BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",4438Kernel, UserCodeEntryBB);4439BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(4440Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);4441A.registerManifestAddedBasicBlock(*InitBB);4442A.registerManifestAddedBasicBlock(*UserCodeEntryBB);4443A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);4444A.registerManifestAddedBasicBlock(*StateMachineBeginBB);4445A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);4446A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);4447A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);4448A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);4449A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);44504451const DebugLoc &DLoc = KernelInitCB->getDebugLoc();4452ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);4453InitBB->getTerminator()->eraseFromParent();44544455Instruction *IsWorker =4456ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,4457ConstantInt::get(KernelInitCB->getType(), -1),4458"thread.is_worker", InitBB);4459IsWorker->setDebugLoc(DLoc);4460BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);44614462Module &M = *Kernel->getParent();4463FunctionCallee BlockHwSizeFn 
=4464OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4465M, OMPRTL___kmpc_get_hardware_num_threads_in_block);4466FunctionCallee WarpSizeFn =4467OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4468M, OMPRTL___kmpc_get_warp_size);4469CallInst *BlockHwSize =4470CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB);4471OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);4472BlockHwSize->setDebugLoc(DLoc);4473CallInst *WarpSize =4474CallInst::Create(WarpSizeFn, "warp.size", IsWorkerCheckBB);4475OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);4476WarpSize->setDebugLoc(DLoc);4477Instruction *BlockSize = BinaryOperator::CreateSub(4478BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB);4479BlockSize->setDebugLoc(DLoc);4480Instruction *IsMainOrWorker = ICmpInst::Create(4481ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, BlockSize,4482"thread.is_main_or_worker", IsWorkerCheckBB);4483IsMainOrWorker->setDebugLoc(DLoc);4484BranchInst::Create(StateMachineBeginBB, StateMachineFinishedBB,4485IsMainOrWorker, IsWorkerCheckBB);44864487// Create local storage for the work function pointer.4488const DataLayout &DL = M.getDataLayout();4489Type *VoidPtrTy = PointerType::getUnqual(Ctx);4490Instruction *WorkFnAI =4491new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,4492"worker.work_fn.addr", Kernel->getEntryBlock().begin());4493WorkFnAI->setDebugLoc(DLoc);44944495OMPInfoCache.OMPBuilder.updateToLocation(4496OpenMPIRBuilder::LocationDescription(4497IRBuilder<>::InsertPoint(StateMachineBeginBB,4498StateMachineBeginBB->end()),4499DLoc));45004501Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);4502Value *GTid = KernelInitCB;45034504FunctionCallee BarrierFn =4505OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4506M, OMPRTL___kmpc_barrier_simple_generic);4507CallInst *Barrier =4508CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);4509OMPInfoCache.setCallingConvention(BarrierFn, Barrier);4510Barrier->setDebugLoc(DLoc);45114512if (WorkFnAI->getType()->getPointerAddressSpace() !=4513(unsigned int)AddressSpace::Generic) {4514WorkFnAI = new AddrSpaceCastInst(4515WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),4516WorkFnAI->getName() + ".generic", StateMachineBeginBB);4517WorkFnAI->setDebugLoc(DLoc);4518}45194520FunctionCallee KernelParallelFn =4521OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4522M, OMPRTL___kmpc_kernel_parallel);4523CallInst *IsActiveWorker = CallInst::Create(4524KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);4525OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);4526IsActiveWorker->setDebugLoc(DLoc);4527Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",4528StateMachineBeginBB);4529WorkFn->setDebugLoc(DLoc);45304531FunctionType *ParallelRegionFnTy = FunctionType::get(4532Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},4533false);45344535Instruction *IsDone =4536ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,4537Constant::getNullValue(VoidPtrTy), "worker.is_done",4538StateMachineBeginBB);4539IsDone->setDebugLoc(DLoc);4540BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,4541IsDone, StateMachineBeginBB)4542->setDebugLoc(DLoc);45434544BranchInst::Create(StateMachineIfCascadeCurrentBB,4545StateMachineDoneBarrierBB, IsActiveWorker,4546StateMachineIsActiveCheckBB)4547->setDebugLoc(DLoc);45484549Value *ZeroArg 
=4550Constant::getNullValue(ParallelRegionFnTy->getParamType(0));45514552const unsigned int WrapperFunctionArgNo = 6;45534554// Now that we have most of the CFG skeleton it is time for the if-cascade4555// that checks the function pointer we got from the runtime against the4556// parallel regions we expect, if there are any.4557for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {4558auto *CB = ReachedKnownParallelRegions[I];4559auto *ParallelRegion = dyn_cast<Function>(4560CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());4561BasicBlock *PRExecuteBB = BasicBlock::Create(4562Ctx, "worker_state_machine.parallel_region.execute", Kernel,4563StateMachineEndParallelBB);4564CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)4565->setDebugLoc(DLoc);4566BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)4567->setDebugLoc(DLoc);45684569BasicBlock *PRNextBB =4570BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",4571Kernel, StateMachineEndParallelBB);4572A.registerManifestAddedBasicBlock(*PRExecuteBB);4573A.registerManifestAddedBasicBlock(*PRNextBB);45744575// Check if we need to compare the pointer at all or if we can just4576// call the parallel region function.4577Value *IsPR;4578if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {4579Instruction *CmpI = ICmpInst::Create(4580ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn, ParallelRegion,4581"worker.check_parallel_region", StateMachineIfCascadeCurrentBB);4582CmpI->setDebugLoc(DLoc);4583IsPR = CmpI;4584} else {4585IsPR = ConstantInt::getTrue(Ctx);4586}45874588BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,4589StateMachineIfCascadeCurrentBB)4590->setDebugLoc(DLoc);4591StateMachineIfCascadeCurrentBB = PRNextBB;4592}45934594// At the end of the if-cascade we place the indirect function pointer call4595// in case we might need it, that is if there can be parallel regions we4596// have not handled in the if-cascade above.4597if (!ReachedUnknownParallelRegions.empty()) {4598StateMachineIfCascadeCurrentBB->setName(4599"worker_state_machine.parallel_region.fallback.execute");4600CallInst::Create(ParallelRegionFnTy, WorkFn, {ZeroArg, GTid}, "",4601StateMachineIfCascadeCurrentBB)4602->setDebugLoc(DLoc);4603}4604BranchInst::Create(StateMachineEndParallelBB,4605StateMachineIfCascadeCurrentBB)4606->setDebugLoc(DLoc);46074608FunctionCallee EndParallelFn =4609OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(4610M, OMPRTL___kmpc_kernel_end_parallel);4611CallInst *EndParallel =4612CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);4613OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);4614EndParallel->setDebugLoc(DLoc);4615BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)4616->setDebugLoc(DLoc);46174618CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)4619->setDebugLoc(DLoc);4620BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)4621->setDebugLoc(DLoc);46224623return true;4624}46254626/// Fixpoint iteration update function. Will be called every time a dependence4627/// changed its state (and in the beginning).4628ChangeStatus updateImpl(Attributor &A) override {4629KernelInfoState StateBefore = getState();46304631// When we leave this function this RAII will make sure the member4632// KernelEnvC is updated properly depending on the state. 
That member is4633// used for simplification of values and needs to be up to date at all4634// times.4635struct UpdateKernelEnvCRAII {4636AAKernelInfoFunction &AA;46374638UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}46394640~UpdateKernelEnvCRAII() {4641if (!AA.KernelEnvC)4642return;46434644ConstantStruct *ExistingKernelEnvC =4645KernelInfo::getKernelEnvironementFromKernelInitCB(AA.KernelInitCB);46464647if (!AA.isValidState()) {4648AA.KernelEnvC = ExistingKernelEnvC;4649return;4650}46514652if (!AA.ReachedKnownParallelRegions.isValidState())4653AA.setUseGenericStateMachineOfKernelEnvironment(4654KernelInfo::getUseGenericStateMachineFromKernelEnvironment(4655ExistingKernelEnvC));46564657if (!AA.SPMDCompatibilityTracker.isValidState())4658AA.setExecModeOfKernelEnvironment(4659KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));46604661ConstantInt *MayUseNestedParallelismC =4662KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(4663AA.KernelEnvC);4664ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(4665MayUseNestedParallelismC->getIntegerType(), AA.NestedParallelism);4666AA.setMayUseNestedParallelismOfKernelEnvironment(4667NewMayUseNestedParallelismC);4668}4669} RAII(*this);46704671// Callback to check a read/write instruction.4672auto CheckRWInst = [&](Instruction &I) {4673// We handle calls later.4674if (isa<CallBase>(I))4675return true;4676// We only care about write effects.4677if (!I.mayWriteToMemory())4678return true;4679if (auto *SI = dyn_cast<StoreInst>(&I)) {4680const auto *UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(4681*this, IRPosition::value(*SI->getPointerOperand()),4682DepClassTy::OPTIONAL);4683auto *HS = A.getAAFor<AAHeapToStack>(4684*this, IRPosition::function(*I.getFunction()),4685DepClassTy::OPTIONAL);4686if (UnderlyingObjsAA &&4687UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {4688if (AA::isAssumedThreadLocalObject(A, Obj, *this))4689return true;4690// Check for AAHeapToStack moved objects which must not be4691// guarded.4692auto *CB = dyn_cast<CallBase>(&Obj);4693return CB && HS && HS->isAssumedHeapToStack(*CB);4694}))4695return true;4696}46974698// Insert instruction that needs guarding.4699SPMDCompatibilityTracker.insert(&I);4700return true;4701};47024703bool UsedAssumedInformationInCheckRWInst = false;4704if (!SPMDCompatibilityTracker.isAtFixpoint())4705if (!A.checkForAllReadWriteInstructions(4706CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))4707SPMDCompatibilityTracker.indicatePessimisticFixpoint();47084709bool UsedAssumedInformationFromReachingKernels = false;4710if (!IsKernelEntry) {4711updateParallelLevels(A);47124713bool AllReachingKernelsKnown = true;4714updateReachingKernelEntries(A, AllReachingKernelsKnown);4715UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;47164717if (!SPMDCompatibilityTracker.empty()) {4718if (!ParallelLevels.isValidState())4719SPMDCompatibilityTracker.indicatePessimisticFixpoint();4720else if (!ReachingKernelEntries.isValidState())4721SPMDCompatibilityTracker.indicatePessimisticFixpoint();4722else {4723// Check if all reaching kernels agree on the mode as we can otherwise4724// not guard instructions. 
We might not be sure about the mode, so we
          // cannot fix the internal SPMDzation state either.
          int SPMD = 0, Generic = 0;
          for (auto *Kernel : ReachingKernelEntries) {
            auto *CBAA = A.getAAFor<AAKernelInfo>(
                *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
            if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
                CBAA->SPMDCompatibilityTracker.isAssumed())
              ++SPMD;
            else
              ++Generic;
            if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
              UsedAssumedInformationFromReachingKernels = true;
          }
          if (SPMD != 0 && Generic != 0)
            SPMDCompatibilityTracker.indicatePessimisticFixpoint();
        }
      }
    }

    // Callback to check a call instruction.
    bool AllParallelRegionStatesWereFixed = true;
    bool AllSPMDStatesWereFixed = true;
    auto CheckCallInst = [&](Instruction &I) {
      auto &CB = cast<CallBase>(I);
      auto *CBAA = A.getAAFor<AAKernelInfo>(
          *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
      if (!CBAA)
        return false;
      getState() ^= CBAA->getState();
      AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
      AllParallelRegionStatesWereFixed &=
          CBAA->ReachedKnownParallelRegions.isAtFixpoint();
      AllParallelRegionStatesWereFixed &=
          CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
      return true;
    };

    bool UsedAssumedInformationInCheckCallInst = false;
    if (!A.checkForAllCallLikeInstructions(
            CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
      LLVM_DEBUG(dbgs() << TAG
                        << "Failed to visit all call-like instructions!\n";);
      return indicatePessimisticFixpoint();
    }

    // If we haven't used any assumed information for the reached parallel
    // region states we can fix it.
    if (!UsedAssumedInformationInCheckCallInst &&
        AllParallelRegionStatesWereFixed) {
      ReachedKnownParallelRegions.indicateOptimisticFixpoint();
      ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
    }

    // If we haven't used any assumed information for the SPMD state we can fix
    // it.
    if (!UsedAssumedInformationInCheckRWInst &&
        !UsedAssumedInformationInCheckCallInst &&
        !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
      SPMDCompatibilityTracker.indicateOptimisticFixpoint();

    return StateBefore == getState() ? 
ChangeStatus::UNCHANGED4786: ChangeStatus::CHANGED;4787}47884789private:4790/// Update info regarding reaching kernels.4791void updateReachingKernelEntries(Attributor &A,4792bool &AllReachingKernelsKnown) {4793auto PredCallSite = [&](AbstractCallSite ACS) {4794Function *Caller = ACS.getInstruction()->getFunction();47954796assert(Caller && "Caller is nullptr");47974798auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(4799IRPosition::function(*Caller), this, DepClassTy::REQUIRED);4800if (CAA && CAA->ReachingKernelEntries.isValidState()) {4801ReachingKernelEntries ^= CAA->ReachingKernelEntries;4802return true;4803}48044805// We lost track of the caller of the associated function, any kernel4806// could reach now.4807ReachingKernelEntries.indicatePessimisticFixpoint();48084809return true;4810};48114812if (!A.checkForAllCallSites(PredCallSite, *this,4813true /* RequireAllCallSites */,4814AllReachingKernelsKnown))4815ReachingKernelEntries.indicatePessimisticFixpoint();4816}48174818/// Update info regarding parallel levels.4819void updateParallelLevels(Attributor &A) {4820auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());4821OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =4822OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];48234824auto PredCallSite = [&](AbstractCallSite ACS) {4825Function *Caller = ACS.getInstruction()->getFunction();48264827assert(Caller && "Caller is nullptr");48284829auto *CAA =4830A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));4831if (CAA && CAA->ParallelLevels.isValidState()) {4832// Any function that is called by `__kmpc_parallel_51` will not be4833// folded as the parallel level in the function is updated. In order to4834// get it right, all the analysis would depend on the implentation. That4835// said, if in the future any change to the implementation, the analysis4836// could be wrong. As a consequence, we are just conservative here.4837if (Caller == Parallel51RFI.Declaration) {4838ParallelLevels.indicatePessimisticFixpoint();4839return true;4840}48414842ParallelLevels ^= CAA->ParallelLevels;48434844return true;4845}48464847// We lost track of the caller of the associated function, any kernel4848// could reach now.4849ParallelLevels.indicatePessimisticFixpoint();48504851return true;4852};48534854bool AllCallSitesKnown = true;4855if (!A.checkForAllCallSites(PredCallSite, *this,4856true /* RequireAllCallSites */,4857AllCallSitesKnown))4858ParallelLevels.indicatePessimisticFixpoint();4859}4860};48614862/// The call site kernel info abstract attribute, basically, what can we say4863/// about a call site with regards to the KernelInfoState. For now this simply4864/// forwards the information from the callee.4865struct AAKernelInfoCallSite : AAKernelInfo {4866AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)4867: AAKernelInfo(IRP, A) {}48684869/// See AbstractAttribute::initialize(...).4870void initialize(Attributor &A) override {4871AAKernelInfo::initialize(A);48724873CallBase &CB = cast<CallBase>(getAssociatedValue());4874auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>(4875*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);48764877// Check for SPMD-mode assumptions.4878if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {4879indicateOptimisticFixpoint();4880return;4881}48824883// First weed out calls we do not care about, that is readonly/readnone4884// calls, intrinsics, and "no_openmp" calls. 
Neither of these can reach a4885// parallel region or anything else we are looking for.4886if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {4887indicateOptimisticFixpoint();4888return;4889}48904891// Next we check if we know the callee. If it is a known OpenMP function4892// we will handle them explicitly in the switch below. If it is not, we4893// will use an AAKernelInfo object on the callee to gather information and4894// merge that into the current state. The latter happens in the updateImpl.4895auto CheckCallee = [&](Function *Callee, unsigned NumCallees) {4896auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());4897const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);4898if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {4899// Unknown caller or declarations are not analyzable, we give up.4900if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {49014902// Unknown callees might contain parallel regions, except if they have4903// an appropriate assumption attached.4904if (!AssumptionAA ||4905!(AssumptionAA->hasAssumption("omp_no_openmp") ||4906AssumptionAA->hasAssumption("omp_no_parallelism")))4907ReachedUnknownParallelRegions.insert(&CB);49084909// If SPMDCompatibilityTracker is not fixed, we need to give up on the4910// idea we can run something unknown in SPMD-mode.4911if (!SPMDCompatibilityTracker.isAtFixpoint()) {4912SPMDCompatibilityTracker.indicatePessimisticFixpoint();4913SPMDCompatibilityTracker.insert(&CB);4914}49154916// We have updated the state for this unknown call properly, there4917// won't be any change so we indicate a fixpoint.4918indicateOptimisticFixpoint();4919}4920// If the callee is known and can be used in IPO, we will update the4921// state based on the callee state in updateImpl.4922return;4923}4924if (NumCallees > 1) {4925indicatePessimisticFixpoint();4926return;4927}49284929RuntimeFunction RF = It->getSecond();4930switch (RF) {4931// All the functions we know are compatible with SPMD mode.4932case OMPRTL___kmpc_is_spmd_exec_mode:4933case OMPRTL___kmpc_distribute_static_fini:4934case OMPRTL___kmpc_for_static_fini:4935case OMPRTL___kmpc_global_thread_num:4936case OMPRTL___kmpc_get_hardware_num_threads_in_block:4937case OMPRTL___kmpc_get_hardware_num_blocks:4938case OMPRTL___kmpc_single:4939case OMPRTL___kmpc_end_single:4940case OMPRTL___kmpc_master:4941case OMPRTL___kmpc_end_master:4942case OMPRTL___kmpc_barrier:4943case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:4944case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:4945case OMPRTL___kmpc_error:4946case OMPRTL___kmpc_flush:4947case OMPRTL___kmpc_get_hardware_thread_id_in_block:4948case OMPRTL___kmpc_get_warp_size:4949case OMPRTL_omp_get_thread_num:4950case OMPRTL_omp_get_num_threads:4951case OMPRTL_omp_get_max_threads:4952case OMPRTL_omp_in_parallel:4953case OMPRTL_omp_get_dynamic:4954case OMPRTL_omp_get_cancellation:4955case OMPRTL_omp_get_nested:4956case OMPRTL_omp_get_schedule:4957case OMPRTL_omp_get_thread_limit:4958case OMPRTL_omp_get_supported_active_levels:4959case OMPRTL_omp_get_max_active_levels:4960case OMPRTL_omp_get_level:4961case OMPRTL_omp_get_ancestor_thread_num:4962case OMPRTL_omp_get_team_size:4963case OMPRTL_omp_get_active_level:4964case OMPRTL_omp_in_final:4965case OMPRTL_omp_get_proc_bind:4966case OMPRTL_omp_get_num_places:4967case OMPRTL_omp_get_num_procs:4968case OMPRTL_omp_get_place_proc_ids:4969case OMPRTL_omp_get_place_num:4970case OMPRTL_omp_get_partition_num_places:4971case OMPRTL_omp_get_partition_place_nums:4972case 
      case OMPRTL_omp_get_wtime:
        break;
      case OMPRTL___kmpc_distribute_static_init_4:
      case OMPRTL___kmpc_distribute_static_init_4u:
      case OMPRTL___kmpc_distribute_static_init_8:
      case OMPRTL___kmpc_distribute_static_init_8u:
      case OMPRTL___kmpc_for_static_init_4:
      case OMPRTL___kmpc_for_static_init_4u:
      case OMPRTL___kmpc_for_static_init_8:
      case OMPRTL___kmpc_for_static_init_8u: {
        // Check the schedule and allow static schedule in SPMD mode.
        unsigned ScheduleArgOpNo = 2;
        auto *ScheduleTypeCI =
            dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
        unsigned ScheduleTypeVal =
            ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
        switch (OMPScheduleType(ScheduleTypeVal)) {
        case OMPScheduleType::UnorderedStatic:
        case OMPScheduleType::UnorderedStaticChunked:
        case OMPScheduleType::OrderedDistribute:
        case OMPScheduleType::OrderedDistributeChunked:
          break;
        default:
          SPMDCompatibilityTracker.indicatePessimisticFixpoint();
          SPMDCompatibilityTracker.insert(&CB);
          break;
        };
      } break;
      case OMPRTL___kmpc_target_init:
        KernelInitCB = &CB;
        break;
      case OMPRTL___kmpc_target_deinit:
        KernelDeinitCB = &CB;
        break;
      case OMPRTL___kmpc_parallel_51:
        if (!handleParallel51(A, CB))
          indicatePessimisticFixpoint();
        return;
      case OMPRTL___kmpc_omp_task:
        // We do not look into tasks right now, just give up.
        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
        SPMDCompatibilityTracker.insert(&CB);
        ReachedUnknownParallelRegions.insert(&CB);
        break;
      case OMPRTL___kmpc_alloc_shared:
      case OMPRTL___kmpc_free_shared:
        // Return without setting a fixpoint, to be resolved in updateImpl.
        return;
      default:
        // Generally, unknown OpenMP runtime calls cannot be executed in
        // SPMD-mode. However, they do not hide parallel regions.
        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
        SPMDCompatibilityTracker.insert(&CB);
        break;
      }
      // All other OpenMP runtime calls will not reach parallel regions so
      // they can be safely ignored for now.
      // Since it is a known OpenMP runtime call we have now modeled all
      // effects and there is no need for any update.
      indicateOptimisticFixpoint();
    };

    const auto *AACE =
        A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
    if (!AACE || !AACE->getState().isValidState() ||
        AACE->hasUnknownCallee()) {
      CheckCallee(getAssociatedFunction(), 1);
      return;
    }
    const auto &OptimisticEdges = AACE->getOptimisticEdges();
    for (auto *Callee : OptimisticEdges) {
      CheckCallee(Callee, OptimisticEdges.size());
      if (isAtFixpoint())
        break;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Once we have call site specific value information we can provide
    //       call site specific liveness information; then it makes sense to
    //       specialize attributes for call site arguments instead of
    //       redirecting requests to the callee argument.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    KernelInfoState StateBefore = getState();

    auto CheckCallee = [&](Function *F, int NumCallees) {
      const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);

      // If F is not a runtime function, propagate the AAKernelInfo of the
      // callee.
      if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
        const IRPosition &FnPos = IRPosition::function(*F);
        auto *FnAA =
            A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
        if (!FnAA)
          return indicatePessimisticFixpoint();
        if (getState() == FnAA->getState())
          return ChangeStatus::UNCHANGED;
        getState() = FnAA->getState();
        return ChangeStatus::CHANGED;
      }
      if (NumCallees > 1)
        return indicatePessimisticFixpoint();

      CallBase &CB = cast<CallBase>(getAssociatedValue());
      if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
        if (!handleParallel51(A, CB))
          return indicatePessimisticFixpoint();
ChangeStatus::UNCHANGED5079: ChangeStatus::CHANGED;5080}50815082// F is a runtime function that allocates or frees memory, check5083// AAHeapToStack and AAHeapToShared.5084assert(5085(It->getSecond() == OMPRTL___kmpc_alloc_shared ||5086It->getSecond() == OMPRTL___kmpc_free_shared) &&5087"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");50885089auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(5090*this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);5091auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(5092*this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);50935094RuntimeFunction RF = It->getSecond();50955096switch (RF) {5097// If neither HeapToStack nor HeapToShared assume the call is removed,5098// assume SPMD incompatibility.5099case OMPRTL___kmpc_alloc_shared:5100if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&5101(!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))5102SPMDCompatibilityTracker.insert(&CB);5103break;5104case OMPRTL___kmpc_free_shared:5105if ((!HeapToStackAA ||5106!HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&5107(!HeapToSharedAA ||5108!HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))5109SPMDCompatibilityTracker.insert(&CB);5110break;5111default:5112SPMDCompatibilityTracker.indicatePessimisticFixpoint();5113SPMDCompatibilityTracker.insert(&CB);5114}5115return ChangeStatus::CHANGED;5116};51175118const auto *AACE =5119A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);5120if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {5121if (Function *F = getAssociatedFunction())5122CheckCallee(F, /*NumCallees=*/1);5123} else {5124const auto &OptimisticEdges = AACE->getOptimisticEdges();5125for (auto *Callee : OptimisticEdges) {5126CheckCallee(Callee, OptimisticEdges.size());5127if (isAtFixpoint())5128break;5129}5130}51315132return StateBefore == getState() ? ChangeStatus::UNCHANGED5133: ChangeStatus::CHANGED;5134}51355136/// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was5137/// handled, if a problem occurred, false is returned.5138bool handleParallel51(Attributor &A, CallBase &CB) {5139const unsigned int NonWrapperFunctionArgNo = 5;5140const unsigned int WrapperFunctionArgNo = 6;5141auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()5142? 
    const unsigned int NonWrapperFunctionArgNo = 5;
    const unsigned int WrapperFunctionArgNo = 6;
    auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
                                     ? NonWrapperFunctionArgNo
                                     : WrapperFunctionArgNo;

    auto *ParallelRegion = dyn_cast<Function>(
        CB.getArgOperand(ParallelRegionOpArgNo)->stripPointerCasts());
    if (!ParallelRegion)
      return false;

    ReachedKnownParallelRegions.insert(&CB);
    /// Check nested parallelism
    auto *FnAA = A.getAAFor<AAKernelInfo>(
        *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
    NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
                         !FnAA->ReachedKnownParallelRegions.empty() ||
                         !FnAA->ReachedKnownParallelRegions.isValidState() ||
                         !FnAA->ReachedUnknownParallelRegions.isValidState() ||
                         !FnAA->ReachedUnknownParallelRegions.empty();
    return true;
  }
};

struct AAFoldRuntimeCall
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;

  AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Statistics are tracked as part of manifest for now.
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
                                              Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAFoldRuntimeCall"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAFoldRuntimeCall
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
  AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAFoldRuntimeCall(IRP, A) {}

  /// See AbstractAttribute::getAsStr()
  const std::string getAsStr(Attributor *) const override {
    if (!isValidState())
      return "<invalid>";

    std::string Str("simplified value: ");

    if (!SimplifiedValue)
      return Str + std::string("none");

    if (!*SimplifiedValue)
      return Str + std::string("nullptr");

    if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
      return Str + std::to_string(CI->getSExtValue());

    return Str + std::string("unknown");
  }

  void initialize(Attributor &A) override {
    if (DisableOpenMPOptFolding)
      indicatePessimisticFixpoint();

    Function *Callee = getAssociatedFunction();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
    assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
           "Expected a known OpenMP runtime function");

    RFKind = It->getSecond();

    CallBase &CB = cast<CallBase>(getAssociatedValue());
    A.registerSimplificationCallback(
        IRPosition::callsite_returned(CB),
        [&](const IRPosition &IRP, const AbstractAttribute *AA,
            bool &UsedAssumedInformation) -> std::optional<Value *> {
          assert((isValidState() ||
                  (SimplifiedValue && *SimplifiedValue == nullptr)) &&
                 "Unexpected invalid state!");

          if (!isAtFixpoint()) {
            UsedAssumedInformation = true;
            if (AA)
              A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
          }
          return SimplifiedValue;
        });
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    switch (RFKind) {
    case OMPRTL___kmpc_is_spmd_exec_mode:
      Changed |= foldIsSPMDExecMode(A);
      break;
    case OMPRTL___kmpc_parallel_level:
      Changed |= foldParallelLevel(A);
      break;
    case OMPRTL___kmpc_get_hardware_num_threads_in_block:
      Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
      break;
    case OMPRTL___kmpc_get_hardware_num_blocks:
      Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
      break;
    default:
      llvm_unreachable("Unhandled OpenMP runtime function!");
    }

    return Changed;
  }

  ChangeStatus manifest(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;

    if (SimplifiedValue && *SimplifiedValue) {
      Instruction &I = *getCtxI();
      A.changeAfterManifest(IRPosition::inst(I), **SimplifiedValue);
      A.deleteAfterManifest(I);

      CallBase *CB = dyn_cast<CallBase>(&I);
      auto Remark = [&](OptimizationRemark OR) {
        if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
          return OR << "Replacing OpenMP runtime call "
                    << CB->getCalledFunction()->getName() << " with "
                    << ore::NV("FoldedValue", C->getZExtValue()) << ".";
        return OR << "Replacing OpenMP runtime call "
                  << CB->getCalledFunction()->getName() << ".";
      };

      if (CB && EnableVerboseRemarks)
        A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);

      LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "
                        << **SimplifiedValue << "\n");

      Changed = ChangeStatus::CHANGED;
    }

    return Changed;
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    SimplifiedValue = nullptr;
    return AAFoldRuntimeCall::indicatePessimisticFixpoint();
  }

private:
  /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
  ChangeStatus foldIsSPMDExecMode(Attributor &A) {
    std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;

    unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
    unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
    auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

    if (!CallerKernelInfoAA ||
        !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
      return indicatePessimisticFixpoint();

    for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
      auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
                                          DepClassTy::REQUIRED);

      if (!AA || !AA->isValidState()) {
        SimplifiedValue = nullptr;
        return indicatePessimisticFixpoint();
      }

      if (AA->SPMDCompatibilityTracker.isAssumed()) {
        if (AA->SPMDCompatibilityTracker.isAtFixpoint())
          ++KnownSPMDCount;
        else
          ++AssumedSPMDCount;
      } else {
        if (AA->SPMDCompatibilityTracker.isAtFixpoint())
          ++KnownNonSPMDCount;
        else
          ++AssumedNonSPMDCount;
      }
    }

    if ((AssumedSPMDCount + KnownSPMDCount) &&
        (AssumedNonSPMDCount + KnownNonSPMDCount))
      return indicatePessimisticFixpoint();

    auto &Ctx = getAnchorValue().getContext();
    if (KnownSPMDCount || AssumedSPMDCount) {
      assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
             "Expected only SPMD kernels!");
      // All reaching kernels are in SPMD mode. Update all function calls to
      // __kmpc_is_spmd_exec_mode to 1.
      SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
    } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
      assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
             "Expected only non-SPMD kernels!");
      // All reaching kernels are in non-SPMD mode. Update all function
      // calls to __kmpc_is_spmd_exec_mode to 0.
      SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
    } else {
      // We have empty reaching kernels, therefore we cannot tell if the
      // associated call site can be folded. At this moment, SimplifiedValue
      // must be none.
      assert(!SimplifiedValue && "SimplifiedValue should be none");
    }

    return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
                                                    : ChangeStatus::CHANGED;
  }
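  // Illustrative example: if every kernel that can reach the anchor function
  // is (assumed) SPMD, the __kmpc_is_spmd_exec_mode call folds to the i8
  // constant 1; if every reaching kernel is generic-mode it folds to 0, and a
  // mix of both prevents folding.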

  /// Fold __kmpc_parallel_level into a constant if possible.
  ChangeStatus foldParallelLevel(Attributor &A) {
    std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;

    auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

    if (!CallerKernelInfoAA ||
        !CallerKernelInfoAA->ParallelLevels.isValidState())
      return indicatePessimisticFixpoint();

    if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
      return indicatePessimisticFixpoint();

    if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
      assert(!SimplifiedValue &&
             "SimplifiedValue should keep none at this point");
      return ChangeStatus::UNCHANGED;
    }

    unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
    unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
    for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
      auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
                                          DepClassTy::REQUIRED);
      if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
        return indicatePessimisticFixpoint();

      if (AA->SPMDCompatibilityTracker.isAssumed()) {
        if (AA->SPMDCompatibilityTracker.isAtFixpoint())
          ++KnownSPMDCount;
        else
          ++AssumedSPMDCount;
      } else {
        if (AA->SPMDCompatibilityTracker.isAtFixpoint())
          ++KnownNonSPMDCount;
        else
          ++AssumedNonSPMDCount;
      }
    }

    if ((AssumedSPMDCount + KnownSPMDCount) &&
        (AssumedNonSPMDCount + KnownNonSPMDCount))
      return indicatePessimisticFixpoint();

    auto &Ctx = getAnchorValue().getContext();
    // If the caller can only be reached by SPMD kernel entries, the parallel
    // level is 1. Similarly, if the caller can only be reached by non-SPMD
    // kernel entries, it is 0.
    if (AssumedSPMDCount || KnownSPMDCount) {
      assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
             "Expected only SPMD kernels!");
      SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
    } else {
      assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
             "Expected only non-SPMD kernels!");
      SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
    }
    return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
                                                    : ChangeStatus::CHANGED;
  }
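  // Illustrative example: a helper reachable only from SPMD kernels has its
  // __kmpc_parallel_level call folded to 1, one reachable only from
  // generic-mode kernels to 0. The attribute-based fold below works the same
  // way, e.g. __kmpc_get_hardware_num_threads_in_block folds to 128 when
  // every reaching kernel carries "omp_target_thread_limit"="128".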
ChangeStatus::UNCHANGED5425: ChangeStatus::CHANGED;5426}54275428ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {5429// Specialize only if all the calls agree with the attribute constant value5430int32_t CurrentAttrValue = -1;5431std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;54325433auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(5434*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);54355436if (!CallerKernelInfoAA ||5437!CallerKernelInfoAA->ReachingKernelEntries.isValidState())5438return indicatePessimisticFixpoint();54395440// Iterate over the kernels that reach this function5441for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {5442int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);54435444if (NextAttrVal == -1 ||5445(CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))5446return indicatePessimisticFixpoint();5447CurrentAttrValue = NextAttrVal;5448}54495450if (CurrentAttrValue != -1) {5451auto &Ctx = getAnchorValue().getContext();5452SimplifiedValue =5453ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);5454}5455return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED5456: ChangeStatus::CHANGED;5457}54585459/// An optional value the associated value is assumed to fold to. That is, we5460/// assume the associated value (which is a call) can be replaced by this5461/// simplified value.5462std::optional<Value *> SimplifiedValue;54635464/// The runtime function kind of the callee of the associated call site.5465RuntimeFunction RFKind;5466};54675468} // namespace54695470/// Register folding callsite5471void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {5472auto &RFI = OMPInfoCache.RFIs[RF];5473RFI.foreachUse(SCC, [&](Use &U, Function &F) {5474CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);5475if (!CI)5476return false;5477A.getOrCreateAAFor<AAFoldRuntimeCall>(5478IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,5479DepClassTy::NONE, /* ForceUpdate */ false,5480/* UpdateAfterInit */ false);5481return false;5482});5483}54845485void OpenMPOpt::registerAAs(bool IsModulePass) {5486if (SCC.empty())5487return;54885489if (IsModulePass) {5490// Ensure we create the AAKernelInfo AAs first and without triggering an5491// update. 
    // This will make sure we register all value simplification callbacks
    // before any other AA has the chance to create an AAValueSimplify or
    // similar.
    auto CreateKernelInfoCB = [&](Use &, Function &Kernel) {
      A.getOrCreateAAFor<AAKernelInfo>(
          IRPosition::function(Kernel), /* QueryingAA */ nullptr,
          DepClassTy::NONE, /* ForceUpdate */ false,
          /* UpdateAfterInit */ false);
      return false;
    };
    OMPInformationCache::RuntimeFunctionInfo &InitRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
    InitRFI.foreachUse(SCC, CreateKernelInfoCB);

    registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
    registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
  }

  // Create CallSite AA for all Getters.
  if (DeduceICVValues) {
    for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
      auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

      auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

      auto CreateAA = [&](Use &U, Function &Caller) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
        if (!CI)
          return false;

        auto &CB = cast<CallBase>(*CI);

        IRPosition CBPos = IRPosition::callsite_function(CB);
        A.getOrCreateAAFor<AAICVTracker>(CBPos);
        return false;
      };

      GetterRFI.foreachUse(SCC, CreateAA);
    }
  }

  // Create an ExecutionDomain AA for every function and a HeapToStack AA for
  // every function if there is a device kernel.
  if (!isOpenMPDevice(M))
    return;

  for (auto *F : SCC) {
    if (F->isDeclaration())
      continue;

    // We look at internal functions only on-demand, but if any use is not a
    // direct call or is outside the current set of analyzed functions, we
    // have to do it eagerly.
    if (F->hasLocalLinkage()) {
      if (llvm::all_of(F->uses(), [this](const Use &U) {
            const auto *CB = dyn_cast<CallBase>(U.getUser());
            return CB && CB->isCallee(&U) &&
                   A.isRunOn(const_cast<Function *>(CB->getCaller()));
          }))
        continue;
    }
    registerAAsForFunction(A, *F);
  }
}

void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
  if (!DisableOpenMPOptDeglobalization)
    A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
  A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
  if (!DisableOpenMPOptDeglobalization)
    A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
  if (F.hasFnAttribute(Attribute::Convergent))
    A.getOrCreateAAFor<AANonConvergent>(IRPosition::function(F));

  for (auto &I : instructions(F)) {
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      bool UsedAssumedInformation = false;
      A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
                             UsedAssumedInformation, AA::Interprocedural);
      continue;
    }
    if (auto *CI = dyn_cast<CallBase>(&I)) {
      if (CI->isIndirectCall())
        A.getOrCreateAAFor<AAIndirectCallInfo>(
            IRPosition::callsite_function(*CI));
    }
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
      continue;
    }
    if (auto *FI = dyn_cast<FenceInst>(&I)) {
      A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI));
      continue;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      if (II->getIntrinsicID() == Intrinsic::assume) {
        A.getOrCreateAAFor<AAPotentialValues>(
            IRPosition::value(*II->getArgOperand(0)));
        continue;
      }
    }
  }
}

const char AAICVTracker::ID = 0;
const char AAKernelInfo::ID = 0;
const char AAExecutionDomain::ID = 0;
const char AAHeapToShared::ID = 0;
const char AAFoldRuntimeCall::ID = 0;

AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAICVTracker *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable("ICVTracker can only be created for function position!");
  case IRPosition::IRP_RETURNED:
    AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
    break;
  case IRPosition::IRP_CALL_SITE_RETURNED:
    AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
    break;
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
    break;
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
    break;
  }

  return *AA;
}

AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
                                                        Attributor &A) {
  AAExecutionDomainFunction *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE:
    llvm_unreachable(
        "AAExecutionDomain can only be created for function position!");
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
    break;
  }

  return *AA;
}

AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
                                                  Attributor &A) {
  AAHeapToSharedFunction *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE:
    llvm_unreachable(
        "AAHeapToShared can only be created for function position!");
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
    break;
  }

  return *AA;
}

AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAKernelInfo *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable("KernelInfo can only be created for function position!");
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
    break;
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
    break;
  }

  return *AA;
}

AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
                                                        Attributor &A) {
  AAFoldRuntimeCall *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_FUNCTION:
  case IRPosition::IRP_CALL_SITE:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable(
        "AAFoldRuntimeCall can only be created for call site position!");
  case IRPosition::IRP_CALL_SITE_RETURNED:
    AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
    break;
  }

  return *AA;
}

PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
  if (!containsOpenMP(M))
    return PreservedAnalyses::all();
  if (DisableOpenMPOptimizations)
    return PreservedAnalyses::all();

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  KernelSet Kernels = getDeviceKernels(M);

  if (PrintModuleBeforeOptimizations)
    LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M);

  auto IsCalled = [&](Function &F) {
    if (Kernels.contains(&F))
      return true;
    for (const User *U : F.users())
      if (!isa<BlockAddress>(U))
        return true;
    return false;
  };

  auto EmitRemark = [&](Function &F) {
    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
    ORE.emit([&]() {
      OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
      return ORA << "Could not internalize function. "
                 << "Some optimizations may not be possible. [OMP140]";
    });
  };

  bool Changed = false;

  // Create internal copies of each function if this is a kernel module. This
  // allows interprocedural passes to see every call edge.
  DenseMap<Function *, Function *> InternalizedMap;
  if (isOpenMPDevice(M)) {
    SmallPtrSet<Function *, 16> InternalizeFns;
    for (Function &F : M)
      if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
          !DisableInternalization) {
        if (Attributor::isInternalizable(F)) {
          InternalizeFns.insert(&F);
        } else if (!F.hasLocalLinkage() &&
                   !F.hasFnAttribute(Attribute::Cold)) {
          EmitRemark(F);
        }
      }

    Changed |=
        Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
  }

  // Look at every function in the Module unless it was internalized.
  SetVector<Function *> Functions;
  SmallVector<Function *, 16> SCC;
  for (Function &F : M)
    if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
      SCC.push_back(&F);
      Functions.insert(&F);
    }

  if (SCC.empty())
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();

  AnalysisGetter AG(FAM);

  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };

  BumpPtrAllocator Allocator;
  CallGraphUpdater CGUpdater;

  bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
                  LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
  OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, PostLink);

  unsigned MaxFixpointIterations =
SetFixpointIterations : 32;57925793AttributorConfig AC(CGUpdater);5794AC.DefaultInitializeLiveInternals = false;5795AC.IsModulePass = true;5796AC.RewriteSignatures = false;5797AC.MaxFixpointIterations = MaxFixpointIterations;5798AC.OREGetter = OREGetter;5799AC.PassName = DEBUG_TYPE;5800AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;5801AC.IPOAmendableCB = [](const Function &F) {5802return F.hasFnAttribute("kernel");5803};58045805Attributor A(Functions, InfoCache, AC);58065807OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);5808Changed |= OMPOpt.run(true);58095810// Optionally inline device functions for potentially better performance.5811if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))5812for (Function &F : M)5813if (!F.isDeclaration() && !Kernels.contains(&F) &&5814!F.hasFnAttribute(Attribute::NoInline))5815F.addFnAttr(Attribute::AlwaysInline);58165817if (PrintModuleAfterOptimizations)5818LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);58195820if (Changed)5821return PreservedAnalyses::none();58225823return PreservedAnalyses::all();5824}58255826PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,5827CGSCCAnalysisManager &AM,5828LazyCallGraph &CG,5829CGSCCUpdateResult &UR) {5830if (!containsOpenMP(*C.begin()->getFunction().getParent()))5831return PreservedAnalyses::all();5832if (DisableOpenMPOptimizations)5833return PreservedAnalyses::all();58345835SmallVector<Function *, 16> SCC;5836// If there are kernels in the module, we have to run on all SCC's.5837for (LazyCallGraph::Node &N : C) {5838Function *Fn = &N.getFunction();5839SCC.push_back(Fn);5840}58415842if (SCC.empty())5843return PreservedAnalyses::all();58445845Module &M = *C.begin()->getFunction().getParent();58465847if (PrintModuleBeforeOptimizations)5848LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M);58495850KernelSet Kernels = getDeviceKernels(M);58515852FunctionAnalysisManager &FAM =5853AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();58545855AnalysisGetter AG(FAM);58565857auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {5858return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);5859};58605861BumpPtrAllocator Allocator;5862CallGraphUpdater CGUpdater;5863CGUpdater.initialize(CG, C, AM, UR);58645865bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||5866LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;5867SetVector<Function *> Functions(SCC.begin(), SCC.end());5868OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,5869/*CGSCC*/ &Functions, PostLink);58705871unsigned MaxFixpointIterations =5872(isOpenMPDevice(M)) ? 
SetFixpointIterations : 32;58735874AttributorConfig AC(CGUpdater);5875AC.DefaultInitializeLiveInternals = false;5876AC.IsModulePass = false;5877AC.RewriteSignatures = false;5878AC.MaxFixpointIterations = MaxFixpointIterations;5879AC.OREGetter = OREGetter;5880AC.PassName = DEBUG_TYPE;5881AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;58825883Attributor A(Functions, InfoCache, AC);58845885OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);5886bool Changed = OMPOpt.run(false);58875888if (PrintModuleAfterOptimizations)5889LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);58905891if (Changed)5892return PreservedAnalyses::none();58935894return PreservedAnalyses::all();5895}58965897bool llvm::omp::isOpenMPKernel(Function &Fn) {5898return Fn.hasFnAttribute("kernel");5899}59005901KernelSet llvm::omp::getDeviceKernels(Module &M) {5902// TODO: Create a more cross-platform way of determining device kernels.5903NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");5904KernelSet Kernels;59055906if (!MD)5907return Kernels;59085909for (auto *Op : MD->operands()) {5910if (Op->getNumOperands() < 2)5911continue;5912MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));5913if (!KindID || KindID->getString() != "kernel")5914continue;59155916Function *KernelFn =5917mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));5918if (!KernelFn)5919continue;59205921// We are only interested in OpenMP target regions. Others, such as kernels5922// generated by CUDA but linked together, are not interesting to this pass.5923if (isOpenMPKernel(*KernelFn)) {5924++NumOpenMPTargetRegionKernels;5925Kernels.insert(KernelFn);5926} else5927++NumNonOpenMPTargetRegionKernels;5928}59295930return Kernels;5931}59325933bool llvm::omp::containsOpenMP(Module &M) {5934Metadata *MD = M.getModuleFlag("openmp");5935if (!MD)5936return false;59375938return true;5939}59405941bool llvm::omp::isOpenMPDevice(Module &M) {5942Metadata *MD = M.getModuleFlag("openmp-device");5943if (!MD)5944return false;59455946return true;5947}594859495950