Path: blob/main/contrib/llvm-project/llvm/lib/CodeGen/ExpandMemCmp.cpp
35234 views
//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This pass tries to expand memcmp() calls into optimally-sized loads and9// compares for the target.10//11//===----------------------------------------------------------------------===//1213#include "llvm/CodeGen/ExpandMemCmp.h"14#include "llvm/ADT/Statistic.h"15#include "llvm/Analysis/ConstantFolding.h"16#include "llvm/Analysis/DomTreeUpdater.h"17#include "llvm/Analysis/LazyBlockFrequencyInfo.h"18#include "llvm/Analysis/ProfileSummaryInfo.h"19#include "llvm/Analysis/TargetLibraryInfo.h"20#include "llvm/Analysis/TargetTransformInfo.h"21#include "llvm/Analysis/ValueTracking.h"22#include "llvm/CodeGen/TargetPassConfig.h"23#include "llvm/CodeGen/TargetSubtargetInfo.h"24#include "llvm/IR/Dominators.h"25#include "llvm/IR/IRBuilder.h"26#include "llvm/IR/PatternMatch.h"27#include "llvm/InitializePasses.h"28#include "llvm/Target/TargetMachine.h"29#include "llvm/Transforms/Utils/BasicBlockUtils.h"30#include "llvm/Transforms/Utils/Local.h"31#include "llvm/Transforms/Utils/SizeOpts.h"32#include <optional>3334using namespace llvm;35using namespace llvm::PatternMatch;3637namespace llvm {38class TargetLowering;39}4041#define DEBUG_TYPE "expand-memcmp"4243STATISTIC(NumMemCmpCalls, "Number of memcmp calls");44STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");45STATISTIC(NumMemCmpGreaterThanMax,46"Number of memcmp calls with size greater than max size");47STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");4849static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(50"memcmp-num-loads-per-block", cl::Hidden, cl::init(1),51cl::desc("The number of loads per basic block for inline expansion of "52"memcmp that is only being compared against zero."));5354static cl::opt<unsigned> MaxLoadsPerMemcmp(55"max-loads-per-memcmp", cl::Hidden,56cl::desc("Set maximum number of loads used in expanded memcmp"));5758static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(59"max-loads-per-memcmp-opt-size", cl::Hidden,60cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));6162namespace {636465// This class provides helper functions to expand a memcmp library call into an66// inline expansion.67class MemCmpExpansion {68struct ResultBlock {69BasicBlock *BB = nullptr;70PHINode *PhiSrc1 = nullptr;71PHINode *PhiSrc2 = nullptr;7273ResultBlock() = default;74};7576CallInst *const CI = nullptr;77ResultBlock ResBlock;78const uint64_t Size;79unsigned MaxLoadSize = 0;80uint64_t NumLoadsNonOneByte = 0;81const uint64_t NumLoadsPerBlockForZeroCmp;82std::vector<BasicBlock *> LoadCmpBlocks;83BasicBlock *EndBlock = nullptr;84PHINode *PhiRes = nullptr;85const bool IsUsedForZeroCmp;86const DataLayout &DL;87DomTreeUpdater *DTU = nullptr;88IRBuilder<> Builder;89// Represents the decomposition in blocks of the expansion. For example,90// comparing 33 bytes on X86+sse can be done with 2x16-byte loads and91// 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}.92struct LoadEntry {93LoadEntry(unsigned LoadSize, uint64_t Offset)94: LoadSize(LoadSize), Offset(Offset) {95}9697// The size of the load for this block, in bytes.98unsigned LoadSize;99// The offset of this load from the base pointer, in bytes.100uint64_t Offset;101};102using LoadEntryVector = SmallVector<LoadEntry, 8>;103LoadEntryVector LoadSequence;104105void createLoadCmpBlocks();106void createResultBlock();107void setupResultBlockPHINodes();108void setupEndBlockPHINodes();109Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);110void emitLoadCompareBlock(unsigned BlockIndex);111void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,112unsigned &LoadIndex);113void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes);114void emitMemCmpResultBlock();115Value *getMemCmpExpansionZeroCase();116Value *getMemCmpEqZeroOneBlock();117Value *getMemCmpOneBlock();118struct LoadPair {119Value *Lhs = nullptr;120Value *Rhs = nullptr;121};122LoadPair getLoadPair(Type *LoadSizeType, Type *BSwapSizeType,123Type *CmpSizeType, unsigned OffsetBytes);124125static LoadEntryVector126computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,127unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte);128static LoadEntryVector129computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize,130unsigned MaxNumLoads,131unsigned &NumLoadsNonOneByte);132133static void optimiseLoadSequence(134LoadEntryVector &LoadSequence,135const TargetTransformInfo::MemCmpExpansionOptions &Options,136bool IsUsedForZeroCmp);137138public:139MemCmpExpansion(CallInst *CI, uint64_t Size,140const TargetTransformInfo::MemCmpExpansionOptions &Options,141const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,142DomTreeUpdater *DTU);143144unsigned getNumBlocks();145uint64_t getNumLoads() const { return LoadSequence.size(); }146147Value *getMemCmpExpansion();148};149150MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence(151uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,152const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) {153NumLoadsNonOneByte = 0;154LoadEntryVector LoadSequence;155uint64_t Offset = 0;156while (Size && !LoadSizes.empty()) {157const unsigned LoadSize = LoadSizes.front();158const uint64_t NumLoadsForThisSize = Size / LoadSize;159if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {160// Do not expand if the total number of loads is larger than what the161// target allows. Note that it's important that we exit before completing162// the expansion to avoid using a ton of memory to store the expansion for163// large sizes.164return {};165}166if (NumLoadsForThisSize > 0) {167for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {168LoadSequence.push_back({LoadSize, Offset});169Offset += LoadSize;170}171if (LoadSize > 1)172++NumLoadsNonOneByte;173Size = Size % LoadSize;174}175LoadSizes = LoadSizes.drop_front();176}177return LoadSequence;178}179180MemCmpExpansion::LoadEntryVector181MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,182const unsigned MaxLoadSize,183const unsigned MaxNumLoads,184unsigned &NumLoadsNonOneByte) {185// These are already handled by the greedy approach.186if (Size < 2 || MaxLoadSize < 2)187return {};188189// We try to do as many non-overlapping loads as possible starting from the190// beginning.191const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize;192assert(NumNonOverlappingLoads && "there must be at least one load");193// There remain 0 to (MaxLoadSize - 1) bytes to load, this will be done with194// an overlapping load.195Size = Size - NumNonOverlappingLoads * MaxLoadSize;196// Bail if we do not need an overloapping store, this is already handled by197// the greedy approach.198if (Size == 0)199return {};200// Bail if the number of loads (non-overlapping + potential overlapping one)201// is larger than the max allowed.202if ((NumNonOverlappingLoads + 1) > MaxNumLoads)203return {};204205// Add non-overlapping loads.206LoadEntryVector LoadSequence;207uint64_t Offset = 0;208for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) {209LoadSequence.push_back({MaxLoadSize, Offset});210Offset += MaxLoadSize;211}212213// Add the last overlapping load.214assert(Size > 0 && Size < MaxLoadSize && "broken invariant");215LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)});216NumLoadsNonOneByte = 1;217return LoadSequence;218}219220void MemCmpExpansion::optimiseLoadSequence(221LoadEntryVector &LoadSequence,222const TargetTransformInfo::MemCmpExpansionOptions &Options,223bool IsUsedForZeroCmp) {224// This part of code attempts to optimize the LoadSequence by merging allowed225// subsequences into single loads of allowed sizes from226// `MemCmpExpansionOptions::AllowedTailExpansions`. If it is for zero227// comparison or if no allowed tail expansions are specified, we exit early.228if (IsUsedForZeroCmp || Options.AllowedTailExpansions.empty())229return;230231while (LoadSequence.size() >= 2) {232auto Last = LoadSequence[LoadSequence.size() - 1];233auto PreLast = LoadSequence[LoadSequence.size() - 2];234235// Exit the loop if the two sequences are not contiguous236if (PreLast.Offset + PreLast.LoadSize != Last.Offset)237break;238239auto LoadSize = Last.LoadSize + PreLast.LoadSize;240if (find(Options.AllowedTailExpansions, LoadSize) ==241Options.AllowedTailExpansions.end())242break;243244// Remove the last two sequences and replace with the combined sequence245LoadSequence.pop_back();246LoadSequence.pop_back();247LoadSequence.emplace_back(PreLast.Offset, LoadSize);248}249}250251// Initialize the basic block structure required for expansion of memcmp call252// with given maximum load size and memcmp size parameter.253// This structure includes:254// 1. A list of load compare blocks - LoadCmpBlocks.255// 2. An EndBlock, split from original instruction point, which is the block to256// return from.257// 3. ResultBlock, block to branch to for early exit when a258// LoadCmpBlock finds a difference.259MemCmpExpansion::MemCmpExpansion(260CallInst *const CI, uint64_t Size,261const TargetTransformInfo::MemCmpExpansionOptions &Options,262const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,263DomTreeUpdater *DTU)264: CI(CI), Size(Size), NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),265IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), DTU(DTU),266Builder(CI) {267assert(Size > 0 && "zero blocks");268// Scale the max size down if the target can load more bytes than we need.269llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);270while (!LoadSizes.empty() && LoadSizes.front() > Size) {271LoadSizes = LoadSizes.drop_front();272}273assert(!LoadSizes.empty() && "cannot load Size bytes");274MaxLoadSize = LoadSizes.front();275// Compute the decomposition.276unsigned GreedyNumLoadsNonOneByte = 0;277LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,278GreedyNumLoadsNonOneByte);279NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;280assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");281// If we allow overlapping loads and the load sequence is not already optimal,282// use overlapping loads.283if (Options.AllowOverlappingLoads &&284(LoadSequence.empty() || LoadSequence.size() > 2)) {285unsigned OverlappingNumLoadsNonOneByte = 0;286auto OverlappingLoads = computeOverlappingLoadSequence(287Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);288if (!OverlappingLoads.empty() &&289(LoadSequence.empty() ||290OverlappingLoads.size() < LoadSequence.size())) {291LoadSequence = OverlappingLoads;292NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;293}294}295assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");296optimiseLoadSequence(LoadSequence, Options, IsUsedForZeroCmp);297}298299unsigned MemCmpExpansion::getNumBlocks() {300if (IsUsedForZeroCmp)301return getNumLoads() / NumLoadsPerBlockForZeroCmp +302(getNumLoads() % NumLoadsPerBlockForZeroCmp != 0 ? 1 : 0);303return getNumLoads();304}305306void MemCmpExpansion::createLoadCmpBlocks() {307for (unsigned i = 0; i < getNumBlocks(); i++) {308BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",309EndBlock->getParent(), EndBlock);310LoadCmpBlocks.push_back(BB);311}312}313314void MemCmpExpansion::createResultBlock() {315ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",316EndBlock->getParent(), EndBlock);317}318319MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,320Type *BSwapSizeType,321Type *CmpSizeType,322unsigned OffsetBytes) {323// Get the memory source at offset `OffsetBytes`.324Value *LhsSource = CI->getArgOperand(0);325Value *RhsSource = CI->getArgOperand(1);326Align LhsAlign = LhsSource->getPointerAlignment(DL);327Align RhsAlign = RhsSource->getPointerAlignment(DL);328if (OffsetBytes > 0) {329auto *ByteType = Type::getInt8Ty(CI->getContext());330LhsSource = Builder.CreateConstGEP1_64(ByteType, LhsSource, OffsetBytes);331RhsSource = Builder.CreateConstGEP1_64(ByteType, RhsSource, OffsetBytes);332LhsAlign = commonAlignment(LhsAlign, OffsetBytes);333RhsAlign = commonAlignment(RhsAlign, OffsetBytes);334}335336// Create a constant or a load from the source.337Value *Lhs = nullptr;338if (auto *C = dyn_cast<Constant>(LhsSource))339Lhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);340if (!Lhs)341Lhs = Builder.CreateAlignedLoad(LoadSizeType, LhsSource, LhsAlign);342343Value *Rhs = nullptr;344if (auto *C = dyn_cast<Constant>(RhsSource))345Rhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);346if (!Rhs)347Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);348349// Zero extend if Byte Swap intrinsic has different type350if (BSwapSizeType && LoadSizeType != BSwapSizeType) {351Lhs = Builder.CreateZExt(Lhs, BSwapSizeType);352Rhs = Builder.CreateZExt(Rhs, BSwapSizeType);353}354355// Swap bytes if required.356if (BSwapSizeType) {357Function *Bswap = Intrinsic::getDeclaration(358CI->getModule(), Intrinsic::bswap, BSwapSizeType);359Lhs = Builder.CreateCall(Bswap, Lhs);360Rhs = Builder.CreateCall(Bswap, Rhs);361}362363// Zero extend if required.364if (CmpSizeType != nullptr && CmpSizeType != Lhs->getType()) {365Lhs = Builder.CreateZExt(Lhs, CmpSizeType);366Rhs = Builder.CreateZExt(Rhs, CmpSizeType);367}368return {Lhs, Rhs};369}370371// This function creates the IR instructions for loading and comparing 1 byte.372// It loads 1 byte from each source of the memcmp parameters with the given373// GEPIndex. It then subtracts the two loaded values and adds this result to the374// final phi node for selecting the memcmp result.375void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,376unsigned OffsetBytes) {377BasicBlock *BB = LoadCmpBlocks[BlockIndex];378Builder.SetInsertPoint(BB);379const LoadPair Loads =380getLoadPair(Type::getInt8Ty(CI->getContext()), nullptr,381Type::getInt32Ty(CI->getContext()), OffsetBytes);382Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);383384PhiRes->addIncoming(Diff, BB);385386if (BlockIndex < (LoadCmpBlocks.size() - 1)) {387// Early exit branch if difference found to EndBlock. Otherwise, continue to388// next LoadCmpBlock,389Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,390ConstantInt::get(Diff->getType(), 0));391BranchInst *CmpBr =392BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);393Builder.Insert(CmpBr);394if (DTU)395DTU->applyUpdates(396{{DominatorTree::Insert, BB, EndBlock},397{DominatorTree::Insert, BB, LoadCmpBlocks[BlockIndex + 1]}});398} else {399// The last block has an unconditional branch to EndBlock.400BranchInst *CmpBr = BranchInst::Create(EndBlock);401Builder.Insert(CmpBr);402if (DTU)403DTU->applyUpdates({{DominatorTree::Insert, BB, EndBlock}});404}405}406407/// Generate an equality comparison for one or more pairs of loaded values.408/// This is used in the case where the memcmp() call is compared equal or not409/// equal to zero.410Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,411unsigned &LoadIndex) {412assert(LoadIndex < getNumLoads() &&413"getCompareLoadPairs() called with no remaining loads");414std::vector<Value *> XorList, OrList;415Value *Diff = nullptr;416417const unsigned NumLoads =418std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp);419420// For a single-block expansion, start inserting before the memcmp call.421if (LoadCmpBlocks.empty())422Builder.SetInsertPoint(CI);423else424Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);425426Value *Cmp = nullptr;427// If we have multiple loads per block, we need to generate a composite428// comparison using xor+or. The type for the combinations is the largest load429// type.430IntegerType *const MaxLoadType =431NumLoads == 1 ? nullptr432: IntegerType::get(CI->getContext(), MaxLoadSize * 8);433434for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {435const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];436const LoadPair Loads = getLoadPair(437IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8), nullptr,438MaxLoadType, CurLoadEntry.Offset);439440if (NumLoads != 1) {441// If we have multiple loads per block, we need to generate a composite442// comparison using xor+or.443Diff = Builder.CreateXor(Loads.Lhs, Loads.Rhs);444Diff = Builder.CreateZExt(Diff, MaxLoadType);445XorList.push_back(Diff);446} else {447// If there's only one load per block, we just compare the loaded values.448Cmp = Builder.CreateICmpNE(Loads.Lhs, Loads.Rhs);449}450}451452auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {453std::vector<Value *> OutList;454for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {455Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);456OutList.push_back(Or);457}458if (InList.size() % 2 != 0)459OutList.push_back(InList.back());460return OutList;461};462463if (!Cmp) {464// Pairwise OR the XOR results.465OrList = pairWiseOr(XorList);466467// Pairwise OR the OR results until one result left.468while (OrList.size() != 1) {469OrList = pairWiseOr(OrList);470}471472assert(Diff && "Failed to find comparison diff");473Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));474}475476return Cmp;477}478479void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,480unsigned &LoadIndex) {481Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);482483BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))484? EndBlock485: LoadCmpBlocks[BlockIndex + 1];486// Early exit branch if difference found to ResultBlock. Otherwise,487// continue to next LoadCmpBlock or EndBlock.488BasicBlock *BB = Builder.GetInsertBlock();489BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);490Builder.Insert(CmpBr);491if (DTU)492DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},493{DominatorTree::Insert, BB, NextBB}});494495// Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0496// since early exit to ResultBlock was not taken (no difference was found in497// any of the bytes).498if (BlockIndex == LoadCmpBlocks.size() - 1) {499Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);500PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);501}502}503504// This function creates the IR intructions for loading and comparing using the505// given LoadSize. It loads the number of bytes specified by LoadSize from each506// source of the memcmp parameters. It then does a subtract to see if there was507// a difference in the loaded values. If a difference is found, it branches508// with an early exit to the ResultBlock for calculating which source was509// larger. Otherwise, it falls through to the either the next LoadCmpBlock or510// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with511// a special case through emitLoadCompareByteBlock. The special handling can512// simply subtract the loaded values and add it to the result phi node.513void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {514// There is one load per block in this case, BlockIndex == LoadIndex.515const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];516517if (CurLoadEntry.LoadSize == 1) {518MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset);519return;520}521522Type *LoadSizeType =523IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);524Type *BSwapSizeType =525DL.isLittleEndian()526? IntegerType::get(CI->getContext(),527PowerOf2Ceil(CurLoadEntry.LoadSize * 8))528: nullptr;529Type *MaxLoadType = IntegerType::get(530CI->getContext(),531std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(CurLoadEntry.LoadSize)) * 8);532assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");533534Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);535536const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,537CurLoadEntry.Offset);538539// Add the loaded values to the phi nodes for calculating memcmp result only540// if result is not used in a zero equality.541if (!IsUsedForZeroCmp) {542ResBlock.PhiSrc1->addIncoming(Loads.Lhs, LoadCmpBlocks[BlockIndex]);543ResBlock.PhiSrc2->addIncoming(Loads.Rhs, LoadCmpBlocks[BlockIndex]);544}545546Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Loads.Lhs, Loads.Rhs);547BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))548? EndBlock549: LoadCmpBlocks[BlockIndex + 1];550// Early exit branch if difference found to ResultBlock. Otherwise, continue551// to next LoadCmpBlock or EndBlock.552BasicBlock *BB = Builder.GetInsertBlock();553BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);554Builder.Insert(CmpBr);555if (DTU)556DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB},557{DominatorTree::Insert, BB, ResBlock.BB}});558559// Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0560// since early exit to ResultBlock was not taken (no difference was found in561// any of the bytes).562if (BlockIndex == LoadCmpBlocks.size() - 1) {563Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);564PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);565}566}567568// This function populates the ResultBlock with a sequence to calculate the569// memcmp result. It compares the two loaded source values and returns -1 if570// src1 < src2 and 1 if src1 > src2.571void MemCmpExpansion::emitMemCmpResultBlock() {572// Special case: if memcmp result is used in a zero equality, result does not573// need to be calculated and can simply return 1.574if (IsUsedForZeroCmp) {575BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();576Builder.SetInsertPoint(ResBlock.BB, InsertPt);577Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);578PhiRes->addIncoming(Res, ResBlock.BB);579BranchInst *NewBr = BranchInst::Create(EndBlock);580Builder.Insert(NewBr);581if (DTU)582DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});583return;584}585BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();586Builder.SetInsertPoint(ResBlock.BB, InsertPt);587588Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,589ResBlock.PhiSrc2);590591Value *Res =592Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),593ConstantInt::get(Builder.getInt32Ty(), 1));594595PhiRes->addIncoming(Res, ResBlock.BB);596BranchInst *NewBr = BranchInst::Create(EndBlock);597Builder.Insert(NewBr);598if (DTU)599DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});600}601602void MemCmpExpansion::setupResultBlockPHINodes() {603Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);604Builder.SetInsertPoint(ResBlock.BB);605// Note: this assumes one load per block.606ResBlock.PhiSrc1 =607Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");608ResBlock.PhiSrc2 =609Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");610}611612void MemCmpExpansion::setupEndBlockPHINodes() {613Builder.SetInsertPoint(EndBlock, EndBlock->begin());614PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");615}616617Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {618unsigned LoadIndex = 0;619// This loop populates each of the LoadCmpBlocks with the IR sequence to620// handle multiple loads per block.621for (unsigned I = 0; I < getNumBlocks(); ++I) {622emitLoadCompareBlockMultipleLoads(I, LoadIndex);623}624625emitMemCmpResultBlock();626return PhiRes;627}628629/// A memcmp expansion that compares equality with 0 and only has one block of630/// load and compare can bypass the compare, branch, and phi IR that is required631/// in the general case.632Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {633unsigned LoadIndex = 0;634Value *Cmp = getCompareLoadPairs(0, LoadIndex);635assert(LoadIndex == getNumLoads() && "some entries were not consumed");636return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));637}638639/// A memcmp expansion that only has one block of load and compare can bypass640/// the compare, branch, and phi IR that is required in the general case.641/// This function also analyses users of memcmp, and if there is only one user642/// from which we can conclude that only 2 out of 3 memcmp outcomes really643/// matter, then it generates more efficient code with only one comparison.644Value *MemCmpExpansion::getMemCmpOneBlock() {645bool NeedsBSwap = DL.isLittleEndian() && Size != 1;646Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);647Type *BSwapSizeType =648NeedsBSwap ? IntegerType::get(CI->getContext(), PowerOf2Ceil(Size * 8))649: nullptr;650Type *MaxLoadType =651IntegerType::get(CI->getContext(),652std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(Size)) * 8);653654// The i8 and i16 cases don't need compares. We zext the loaded values and655// subtract them to get the suitable negative, zero, or positive i32 result.656if (Size == 1 || Size == 2) {657const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType,658Builder.getInt32Ty(), /*Offset*/ 0);659return Builder.CreateSub(Loads.Lhs, Loads.Rhs);660}661662const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,663/*Offset*/ 0);664665// If a user of memcmp cares only about two outcomes, for example:666// bool result = memcmp(a, b, NBYTES) > 0;667// We can generate more optimal code with a smaller number of operations668if (CI->hasOneUser()) {669auto *UI = cast<Instruction>(*CI->user_begin());670ICmpInst::Predicate Pred = ICmpInst::Predicate::BAD_ICMP_PREDICATE;671uint64_t Shift;672bool NeedsZExt = false;673// This is a special case because instead of checking if the result is less674// than zero:675// bool result = memcmp(a, b, NBYTES) < 0;676// Compiler is clever enough to generate the following code:677// bool result = memcmp(a, b, NBYTES) >> 31;678if (match(UI, m_LShr(m_Value(), m_ConstantInt(Shift))) &&679Shift == (CI->getType()->getIntegerBitWidth() - 1)) {680Pred = ICmpInst::ICMP_SLT;681NeedsZExt = true;682} else {683// In case of a successful match this call will set `Pred` variable684match(UI, m_ICmp(Pred, m_Specific(CI), m_Zero()));685}686// Generate new code and remove the original memcmp call and the user687if (ICmpInst::isSigned(Pred)) {688Value *Cmp = Builder.CreateICmp(CmpInst::getUnsignedPredicate(Pred),689Loads.Lhs, Loads.Rhs);690auto *Result = NeedsZExt ? Builder.CreateZExt(Cmp, UI->getType()) : Cmp;691UI->replaceAllUsesWith(Result);692UI->eraseFromParent();693CI->eraseFromParent();694return nullptr;695}696}697698// The result of memcmp is negative, zero, or positive, so produce that by699// subtracting 2 extended compare bits: sub (ugt, ult).700// If a target prefers to use selects to get -1/0/1, they should be able701// to transform this later. The inverse transform (going from selects to math)702// may not be possible in the DAG because the selects got converted into703// branches before we got there.704Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs);705Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs);706Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());707Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());708return Builder.CreateSub(ZextUGT, ZextULT);709}710711// This function expands the memcmp call into an inline expansion and returns712// the memcmp result. Returns nullptr if the memcmp is already replaced.713Value *MemCmpExpansion::getMemCmpExpansion() {714// Create the basic block framework for a multi-block expansion.715if (getNumBlocks() != 1) {716BasicBlock *StartBlock = CI->getParent();717EndBlock = SplitBlock(StartBlock, CI, DTU, /*LI=*/nullptr,718/*MSSAU=*/nullptr, "endblock");719setupEndBlockPHINodes();720createResultBlock();721722// If return value of memcmp is not used in a zero equality, we need to723// calculate which source was larger. The calculation requires the724// two loaded source values of each load compare block.725// These will be saved in the phi nodes created by setupResultBlockPHINodes.726if (!IsUsedForZeroCmp) setupResultBlockPHINodes();727728// Create the number of required load compare basic blocks.729createLoadCmpBlocks();730731// Update the terminator added by SplitBlock to branch to the first732// LoadCmpBlock.733StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);734if (DTU)735DTU->applyUpdates({{DominatorTree::Insert, StartBlock, LoadCmpBlocks[0]},736{DominatorTree::Delete, StartBlock, EndBlock}});737}738739Builder.SetCurrentDebugLocation(CI->getDebugLoc());740741if (IsUsedForZeroCmp)742return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()743: getMemCmpExpansionZeroCase();744745if (getNumBlocks() == 1)746return getMemCmpOneBlock();747748for (unsigned I = 0; I < getNumBlocks(); ++I) {749emitLoadCompareBlock(I);750}751752emitMemCmpResultBlock();753return PhiRes;754}755756// This function checks to see if an expansion of memcmp can be generated.757// It checks for constant compare size that is less than the max inline size.758// If an expansion cannot occur, returns false to leave as a library call.759// Otherwise, the library call is replaced with a new IR instruction sequence.760/// We want to transform:761/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)762/// To:763/// loadbb:764/// %0 = bitcast i32* %buffer2 to i8*765/// %1 = bitcast i32* %buffer1 to i8*766/// %2 = bitcast i8* %1 to i64*767/// %3 = bitcast i8* %0 to i64*768/// %4 = load i64, i64* %2769/// %5 = load i64, i64* %3770/// %6 = call i64 @llvm.bswap.i64(i64 %4)771/// %7 = call i64 @llvm.bswap.i64(i64 %5)772/// %8 = sub i64 %6, %7773/// %9 = icmp ne i64 %8, 0774/// br i1 %9, label %res_block, label %loadbb1775/// res_block: ; preds = %loadbb2,776/// %loadbb1, %loadbb777/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]778/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]779/// %10 = icmp ult i64 %phi.src1, %phi.src2780/// %11 = select i1 %10, i32 -1, i32 1781/// br label %endblock782/// loadbb1: ; preds = %loadbb783/// %12 = bitcast i32* %buffer2 to i8*784/// %13 = bitcast i32* %buffer1 to i8*785/// %14 = bitcast i8* %13 to i32*786/// %15 = bitcast i8* %12 to i32*787/// %16 = getelementptr i32, i32* %14, i32 2788/// %17 = getelementptr i32, i32* %15, i32 2789/// %18 = load i32, i32* %16790/// %19 = load i32, i32* %17791/// %20 = call i32 @llvm.bswap.i32(i32 %18)792/// %21 = call i32 @llvm.bswap.i32(i32 %19)793/// %22 = zext i32 %20 to i64794/// %23 = zext i32 %21 to i64795/// %24 = sub i64 %22, %23796/// %25 = icmp ne i64 %24, 0797/// br i1 %25, label %res_block, label %loadbb2798/// loadbb2: ; preds = %loadbb1799/// %26 = bitcast i32* %buffer2 to i8*800/// %27 = bitcast i32* %buffer1 to i8*801/// %28 = bitcast i8* %27 to i16*802/// %29 = bitcast i8* %26 to i16*803/// %30 = getelementptr i16, i16* %28, i16 6804/// %31 = getelementptr i16, i16* %29, i16 6805/// %32 = load i16, i16* %30806/// %33 = load i16, i16* %31807/// %34 = call i16 @llvm.bswap.i16(i16 %32)808/// %35 = call i16 @llvm.bswap.i16(i16 %33)809/// %36 = zext i16 %34 to i64810/// %37 = zext i16 %35 to i64811/// %38 = sub i64 %36, %37812/// %39 = icmp ne i64 %38, 0813/// br i1 %39, label %res_block, label %loadbb3814/// loadbb3: ; preds = %loadbb2815/// %40 = bitcast i32* %buffer2 to i8*816/// %41 = bitcast i32* %buffer1 to i8*817/// %42 = getelementptr i8, i8* %41, i8 14818/// %43 = getelementptr i8, i8* %40, i8 14819/// %44 = load i8, i8* %42820/// %45 = load i8, i8* %43821/// %46 = zext i8 %44 to i32822/// %47 = zext i8 %45 to i32823/// %48 = sub i32 %46, %47824/// br label %endblock825/// endblock: ; preds = %res_block,826/// %loadbb3827/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]828/// ret i32 %phi.res829static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,830const TargetLowering *TLI, const DataLayout *DL,831ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,832DomTreeUpdater *DTU, const bool IsBCmp) {833NumMemCmpCalls++;834835// Early exit from expansion if -Oz.836if (CI->getFunction()->hasMinSize())837return false;838839// Early exit from expansion if size is not a constant.840ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));841if (!SizeCast) {842NumMemCmpNotConstant++;843return false;844}845const uint64_t SizeVal = SizeCast->getZExtValue();846847if (SizeVal == 0) {848return false;849}850// TTI call to check if target would like to expand memcmp. Also, get the851// available load sizes.852const bool IsUsedForZeroCmp =853IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);854bool OptForSize = CI->getFunction()->hasOptSize() ||855llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);856auto Options = TTI->enableMemCmpExpansion(OptForSize,857IsUsedForZeroCmp);858if (!Options) return false;859860if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())861Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;862863if (OptForSize &&864MaxLoadsPerMemcmpOptSize.getNumOccurrences())865Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;866867if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())868Options.MaxNumLoads = MaxLoadsPerMemcmp;869870MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);871872// Don't expand if this will require more loads than desired by the target.873if (Expansion.getNumLoads() == 0) {874NumMemCmpGreaterThanMax++;875return false;876}877878NumMemCmpInlined++;879880if (Value *Res = Expansion.getMemCmpExpansion()) {881// Replace call with result of expansion and erase call.882CI->replaceAllUsesWith(Res);883CI->eraseFromParent();884}885886return true;887}888889// Returns true if a change was made.890static bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,891const TargetTransformInfo *TTI, const TargetLowering *TL,892const DataLayout &DL, ProfileSummaryInfo *PSI,893BlockFrequencyInfo *BFI, DomTreeUpdater *DTU);894895static PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,896const TargetTransformInfo *TTI,897const TargetLowering *TL,898ProfileSummaryInfo *PSI,899BlockFrequencyInfo *BFI, DominatorTree *DT);900901class ExpandMemCmpLegacyPass : public FunctionPass {902public:903static char ID;904905ExpandMemCmpLegacyPass() : FunctionPass(ID) {906initializeExpandMemCmpLegacyPassPass(*PassRegistry::getPassRegistry());907}908909bool runOnFunction(Function &F) override {910if (skipFunction(F)) return false;911912auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();913if (!TPC) {914return false;915}916const TargetLowering* TL =917TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();918919const TargetLibraryInfo *TLI =920&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);921const TargetTransformInfo *TTI =922&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);923auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();924auto *BFI = (PSI && PSI->hasProfileSummary()) ?925&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :926nullptr;927DominatorTree *DT = nullptr;928if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())929DT = &DTWP->getDomTree();930auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI, DT);931return !PA.areAllPreserved();932}933934private:935void getAnalysisUsage(AnalysisUsage &AU) const override {936AU.addRequired<TargetLibraryInfoWrapperPass>();937AU.addRequired<TargetTransformInfoWrapperPass>();938AU.addRequired<ProfileSummaryInfoWrapperPass>();939AU.addPreserved<DominatorTreeWrapperPass>();940LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);941FunctionPass::getAnalysisUsage(AU);942}943};944945bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,946const TargetTransformInfo *TTI, const TargetLowering *TL,947const DataLayout &DL, ProfileSummaryInfo *PSI,948BlockFrequencyInfo *BFI, DomTreeUpdater *DTU) {949for (Instruction &I : BB) {950CallInst *CI = dyn_cast<CallInst>(&I);951if (!CI) {952continue;953}954LibFunc Func;955if (TLI->getLibFunc(*CI, Func) &&956(Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&957expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {958return true;959}960}961return false;962}963964PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,965const TargetTransformInfo *TTI,966const TargetLowering *TL, ProfileSummaryInfo *PSI,967BlockFrequencyInfo *BFI, DominatorTree *DT) {968std::optional<DomTreeUpdater> DTU;969if (DT)970DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);971972const DataLayout& DL = F.getDataLayout();973bool MadeChanges = false;974for (auto BBIt = F.begin(); BBIt != F.end();) {975if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {976MadeChanges = true;977// If changes were made, restart the function from the beginning, since978// the structure of the function was changed.979BBIt = F.begin();980} else {981++BBIt;982}983}984if (MadeChanges)985for (BasicBlock &BB : F)986SimplifyInstructionsInBlock(&BB);987if (!MadeChanges)988return PreservedAnalyses::all();989PreservedAnalyses PA;990PA.preserve<DominatorTreeAnalysis>();991return PA;992}993994} // namespace995996PreservedAnalyses ExpandMemCmpPass::run(Function &F,997FunctionAnalysisManager &FAM) {998const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();999const auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);1000const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);1001auto *PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)1002.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());1003BlockFrequencyInfo *BFI = (PSI && PSI->hasProfileSummary())1004? &FAM.getResult<BlockFrequencyAnalysis>(F)1005: nullptr;1006auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);10071008return runImpl(F, &TLI, &TTI, TL, PSI, BFI, DT);1009}10101011char ExpandMemCmpLegacyPass::ID = 0;1012INITIALIZE_PASS_BEGIN(ExpandMemCmpLegacyPass, DEBUG_TYPE,1013"Expand memcmp() to load/stores", false, false)1014INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)1015INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)1016INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)1017INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)1018INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)1019INITIALIZE_PASS_END(ExpandMemCmpLegacyPass, DEBUG_TYPE,1020"Expand memcmp() to load/stores", false, false)10211022FunctionPass *llvm::createExpandMemCmpLegacyPass() {1023return new ExpandMemCmpLegacyPass();1024}102510261027