Path: blob/main/contrib/llvm-project/compiler-rt/lib/fuzzer/FuzzerCorpus.h
35262 views
//===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7// fuzzer::InputCorpus8//===----------------------------------------------------------------------===//910#ifndef LLVM_FUZZER_CORPUS11#define LLVM_FUZZER_CORPUS1213#include "FuzzerDataFlowTrace.h"14#include "FuzzerDefs.h"15#include "FuzzerIO.h"16#include "FuzzerRandom.h"17#include "FuzzerSHA1.h"18#include "FuzzerTracePC.h"19#include <algorithm>20#include <bitset>21#include <chrono>22#include <numeric>23#include <random>24#include <unordered_set>2526namespace fuzzer {2728struct InputInfo {29Unit U; // The actual input data.30std::chrono::microseconds TimeOfUnit;31uint8_t Sha1[kSHA1NumBytes]; // Checksum.32// Number of features that this input has and no smaller input has.33size_t NumFeatures = 0;34size_t Tmp = 0; // Used by ValidateFeatureSet.35// Stats.36size_t NumExecutedMutations = 0;37size_t NumSuccessfullMutations = 0;38bool NeverReduce = false;39bool MayDeleteFile = false;40bool Reduced = false;41bool HasFocusFunction = false;42std::vector<uint32_t> UniqFeatureSet;43std::vector<uint8_t> DataFlowTraceForFocusFunction;44// Power schedule.45bool NeedsEnergyUpdate = false;46double Energy = 0.0;47double SumIncidence = 0.0;48std::vector<std::pair<uint32_t, uint16_t>> FeatureFreqs;4950// Delete feature Idx and its frequency from FeatureFreqs.51bool DeleteFeatureFreq(uint32_t Idx) {52if (FeatureFreqs.empty())53return false;5455// Binary search over local feature frequencies sorted by index.56auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),57std::pair<uint32_t, uint16_t>(Idx, 0));5859if (Lower != FeatureFreqs.end() && Lower->first == Idx) {60FeatureFreqs.erase(Lower);61return true;62}63return false;64}6566// Assign more energy to a high-entropy seed, i.e., that reveals more67// information about the globally rare features in the neighborhood of the68// seed. Since we do not know the entropy of a seed that has never been69// executed we assign fresh seeds maximum entropy and let II->Energy approach70// the true entropy from above. If ScalePerExecTime is true, the computed71// entropy is scaled based on how fast this input executes compared to the72// average execution time of inputs. The faster an input executes, the more73// energy gets assigned to the input.74void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime,75std::chrono::microseconds AverageUnitExecutionTime) {76Energy = 0.0;77SumIncidence = 0.0;7879// Apply add-one smoothing to locally discovered features.80for (const auto &F : FeatureFreqs) {81double LocalIncidence = F.second + 1;82Energy -= LocalIncidence * log(LocalIncidence);83SumIncidence += LocalIncidence;84}8586// Apply add-one smoothing to locally undiscovered features.87// PreciseEnergy -= 0; // since log(1.0) == 0)88SumIncidence +=89static_cast<double>(GlobalNumberOfFeatures - FeatureFreqs.size());9091// Add a single locally abundant feature apply add-one smoothing.92double AbdIncidence = static_cast<double>(NumExecutedMutations + 1);93Energy -= AbdIncidence * log(AbdIncidence);94SumIncidence += AbdIncidence;9596// Normalize.97if (SumIncidence != 0)98Energy = Energy / SumIncidence + log(SumIncidence);99100if (ScalePerExecTime) {101// Scaling to favor inputs with lower execution time.102uint32_t PerfScore = 100;103if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10)104PerfScore = 10;105else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4)106PerfScore = 25;107else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2)108PerfScore = 50;109else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4)110PerfScore = 75;111else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count())112PerfScore = 300;113else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count())114PerfScore = 200;115else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count())116PerfScore = 150;117118Energy *= PerfScore;119}120}121122// Increment the frequency of the feature Idx.123void UpdateFeatureFrequency(uint32_t Idx) {124NeedsEnergyUpdate = true;125126// The local feature frequencies is an ordered vector of pairs.127// If there are no local feature frequencies, push_back preserves order.128// Set the feature frequency for feature Idx32 to 1.129if (FeatureFreqs.empty()) {130FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1));131return;132}133134// Binary search over local feature frequencies sorted by index.135auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),136std::pair<uint32_t, uint16_t>(Idx, 0));137138// If feature Idx32 already exists, increment its frequency.139// Otherwise, insert a new pair right after the next lower index.140if (Lower != FeatureFreqs.end() && Lower->first == Idx) {141Lower->second++;142} else {143FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1));144}145}146};147148struct EntropicOptions {149bool Enabled;150size_t NumberOfRarestFeatures;151size_t FeatureFrequencyThreshold;152bool ScalePerExecTime;153};154155class InputCorpus {156static const uint32_t kFeatureSetSize = 1 << 21;157static const uint8_t kMaxMutationFactor = 20;158static const size_t kSparseEnergyUpdates = 100;159160size_t NumExecutedMutations = 0;161162EntropicOptions Entropic;163164public:165InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic)166: Entropic(Entropic), OutputCorpus(OutputCorpus) {167memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));168memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));169}170~InputCorpus() {171for (auto II : Inputs)172delete II;173}174size_t size() const { return Inputs.size(); }175size_t SizeInBytes() const {176size_t Res = 0;177for (auto II : Inputs)178Res += II->U.size();179return Res;180}181size_t NumActiveUnits() const {182size_t Res = 0;183for (auto II : Inputs)184Res += !II->U.empty();185return Res;186}187size_t MaxInputSize() const {188size_t Res = 0;189for (auto II : Inputs)190Res = std::max(Res, II->U.size());191return Res;192}193void IncrementNumExecutedMutations() { NumExecutedMutations++; }194195size_t NumInputsThatTouchFocusFunction() {196return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {197return II->HasFocusFunction;198});199}200201size_t NumInputsWithDataFlowTrace() {202return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {203return !II->DataFlowTraceForFocusFunction.empty();204});205}206207bool empty() const { return Inputs.empty(); }208const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }209InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,210bool HasFocusFunction, bool NeverReduce,211std::chrono::microseconds TimeOfUnit,212const std::vector<uint32_t> &FeatureSet,213const DataFlowTrace &DFT, const InputInfo *BaseII) {214assert(!U.empty());215if (FeatureDebug)216Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);217// Inputs.size() is cast to uint32_t below.218assert(Inputs.size() < std::numeric_limits<uint32_t>::max());219Inputs.push_back(new InputInfo());220InputInfo &II = *Inputs.back();221II.U = U;222II.NumFeatures = NumFeatures;223II.NeverReduce = NeverReduce;224II.TimeOfUnit = TimeOfUnit;225II.MayDeleteFile = MayDeleteFile;226II.UniqFeatureSet = FeatureSet;227II.HasFocusFunction = HasFocusFunction;228// Assign maximal energy to the new seed.229II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size());230II.SumIncidence = static_cast<double>(RareFeatures.size());231II.NeedsEnergyUpdate = false;232std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());233ComputeSHA1(U.data(), U.size(), II.Sha1);234auto Sha1Str = Sha1ToString(II.Sha1);235Hashes.insert(Sha1Str);236if (HasFocusFunction)237if (auto V = DFT.Get(Sha1Str))238II.DataFlowTraceForFocusFunction = *V;239// This is a gross heuristic.240// Ideally, when we add an element to a corpus we need to know its DFT.241// But if we don't, we'll use the DFT of its base input.242if (II.DataFlowTraceForFocusFunction.empty() && BaseII)243II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;244DistributionNeedsUpdate = true;245PrintCorpus();246// ValidateFeatureSet();247return &II;248}249250// Debug-only251void PrintUnit(const Unit &U) {252if (!FeatureDebug) return;253for (uint8_t C : U) {254if (C != 'F' && C != 'U' && C != 'Z')255C = '.';256Printf("%c", C);257}258}259260// Debug-only261void PrintFeatureSet(const std::vector<uint32_t> &FeatureSet) {262if (!FeatureDebug) return;263Printf("{");264for (uint32_t Feature: FeatureSet)265Printf("%u,", Feature);266Printf("}");267}268269// Debug-only270void PrintCorpus() {271if (!FeatureDebug) return;272Printf("======= CORPUS:\n");273int i = 0;274for (auto II : Inputs) {275if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {276Printf("[%2d] ", i);277Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());278PrintUnit(II->U);279Printf(" ");280PrintFeatureSet(II->UniqFeatureSet);281Printf("\n");282}283i++;284}285}286287void Replace(InputInfo *II, const Unit &U,288std::chrono::microseconds TimeOfUnit) {289assert(II->U.size() > U.size());290Hashes.erase(Sha1ToString(II->Sha1));291DeleteFile(*II);292ComputeSHA1(U.data(), U.size(), II->Sha1);293Hashes.insert(Sha1ToString(II->Sha1));294II->U = U;295II->Reduced = true;296II->TimeOfUnit = TimeOfUnit;297DistributionNeedsUpdate = true;298}299300bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }301bool HasUnit(const std::string &H) { return Hashes.count(H); }302InputInfo &ChooseUnitToMutate(Random &Rand) {303InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];304assert(!II.U.empty());305return II;306}307308InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) {309if (!UniformDist) {310return ChooseUnitToMutate(Rand);311}312InputInfo &II = *Inputs[Rand(Inputs.size())];313assert(!II.U.empty());314return II;315}316317// Returns an index of random unit from the corpus to mutate.318size_t ChooseUnitIdxToMutate(Random &Rand) {319UpdateCorpusDistribution(Rand);320size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));321assert(Idx < Inputs.size());322return Idx;323}324325void PrintStats() {326for (size_t i = 0; i < Inputs.size(); i++) {327const auto &II = *Inputs[i];328Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,329Sha1ToString(II.Sha1).c_str(), II.U.size(),330II.NumExecutedMutations, II.NumSuccessfullMutations,331II.HasFocusFunction);332}333}334335void PrintFeatureSet() {336for (size_t i = 0; i < kFeatureSetSize; i++) {337if(size_t Sz = GetFeature(i))338Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);339}340Printf("\n\t");341for (size_t i = 0; i < Inputs.size(); i++)342if (size_t N = Inputs[i]->NumFeatures)343Printf(" %zd=>%zd ", i, N);344Printf("\n");345}346347void DeleteFile(const InputInfo &II) {348if (!OutputCorpus.empty() && II.MayDeleteFile)349RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));350}351352void DeleteInput(size_t Idx) {353InputInfo &II = *Inputs[Idx];354DeleteFile(II);355Unit().swap(II.U);356II.Energy = 0.0;357II.NeedsEnergyUpdate = false;358DistributionNeedsUpdate = true;359if (FeatureDebug)360Printf("EVICTED %zd\n", Idx);361}362363void AddRareFeature(uint32_t Idx) {364// Maintain *at least* TopXRarestFeatures many rare features365// and all features with a frequency below ConsideredRare.366// Remove all other features.367while (RareFeatures.size() > Entropic.NumberOfRarestFeatures &&368FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) {369370// Find most and second most abbundant feature.371uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0],372RareFeatures[0]};373size_t Delete = 0;374for (size_t i = 0; i < RareFeatures.size(); i++) {375uint32_t Idx2 = RareFeatures[i];376if (GlobalFeatureFreqs[Idx2] >=377GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) {378MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0];379MostAbundantRareFeatureIndices[0] = Idx2;380Delete = i;381}382}383384// Remove most abundant rare feature.385IsRareFeature[Delete] = false;386RareFeatures[Delete] = RareFeatures.back();387RareFeatures.pop_back();388389for (auto II : Inputs) {390if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0]))391II->NeedsEnergyUpdate = true;392}393394// Set 2nd most abundant as the new most abundant feature count.395FreqOfMostAbundantRareFeature =396GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]];397}398399// Add rare feature, handle collisions, and update energy.400RareFeatures.push_back(Idx);401IsRareFeature[Idx] = true;402GlobalFeatureFreqs[Idx] = 0;403for (auto II : Inputs) {404II->DeleteFeatureFreq(Idx);405406// Apply add-one smoothing to this locally undiscovered feature.407// Zero energy seeds will never be fuzzed and remain zero energy.408if (II->Energy > 0.0) {409II->SumIncidence += 1;410II->Energy += log(II->SumIncidence) / II->SumIncidence;411}412}413414DistributionNeedsUpdate = true;415}416417bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {418assert(NewSize);419Idx = Idx % kFeatureSetSize;420uint32_t OldSize = GetFeature(Idx);421if (OldSize == 0 || (Shrink && OldSize > NewSize)) {422if (OldSize > 0) {423size_t OldIdx = SmallestElementPerFeature[Idx];424InputInfo &II = *Inputs[OldIdx];425assert(II.NumFeatures > 0);426II.NumFeatures--;427if (II.NumFeatures == 0)428DeleteInput(OldIdx);429} else {430NumAddedFeatures++;431if (Entropic.Enabled)432AddRareFeature((uint32_t)Idx);433}434NumUpdatedFeatures++;435if (FeatureDebug)436Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);437// Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus.438SmallestElementPerFeature[Idx] = static_cast<uint32_t>(Inputs.size());439InputSizesPerFeature[Idx] = NewSize;440return true;441}442return false;443}444445// Increment frequency of feature Idx globally and locally.446void UpdateFeatureFrequency(InputInfo *II, size_t Idx) {447uint32_t Idx32 = Idx % kFeatureSetSize;448449// Saturated increment.450if (GlobalFeatureFreqs[Idx32] == 0xFFFF)451return;452uint16_t Freq = GlobalFeatureFreqs[Idx32]++;453454// Skip if abundant.455if (Freq > FreqOfMostAbundantRareFeature || !IsRareFeature[Idx32])456return;457458// Update global frequencies.459if (Freq == FreqOfMostAbundantRareFeature)460FreqOfMostAbundantRareFeature++;461462// Update local frequencies.463if (II)464II->UpdateFeatureFrequency(Idx32);465}466467size_t NumFeatures() const { return NumAddedFeatures; }468size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }469470private:471472static const bool FeatureDebug = false;473474uint32_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }475476void ValidateFeatureSet() {477if (FeatureDebug)478PrintFeatureSet();479for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)480if (GetFeature(Idx))481Inputs[SmallestElementPerFeature[Idx]]->Tmp++;482for (auto II: Inputs) {483if (II->Tmp != II->NumFeatures)484Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);485assert(II->Tmp == II->NumFeatures);486II->Tmp = 0;487}488}489490// Updates the probability distribution for the units in the corpus.491// Must be called whenever the corpus or unit weights are changed.492//493// Hypothesis: inputs that maximize information about globally rare features494// are interesting.495void UpdateCorpusDistribution(Random &Rand) {496// Skip update if no seeds or rare features were added/deleted.497// Sparse updates for local change of feature frequencies,498// i.e., randomly do not skip.499if (!DistributionNeedsUpdate &&500(!Entropic.Enabled || Rand(kSparseEnergyUpdates)))501return;502503DistributionNeedsUpdate = false;504505size_t N = Inputs.size();506assert(N);507Intervals.resize(N + 1);508Weights.resize(N);509std::iota(Intervals.begin(), Intervals.end(), 0);510511std::chrono::microseconds AverageUnitExecutionTime(0);512for (auto II : Inputs) {513AverageUnitExecutionTime += II->TimeOfUnit;514}515AverageUnitExecutionTime /= N;516517bool VanillaSchedule = true;518if (Entropic.Enabled) {519for (auto II : Inputs) {520if (II->NeedsEnergyUpdate && II->Energy != 0.0) {521II->NeedsEnergyUpdate = false;522II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime,523AverageUnitExecutionTime);524}525}526527for (size_t i = 0; i < N; i++) {528529if (Inputs[i]->NumFeatures == 0) {530// If the seed doesn't represent any features, assign zero energy.531Weights[i] = 0.;532} else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor >533NumExecutedMutations / Inputs.size()) {534// If the seed was fuzzed a lot more than average, assign zero energy.535Weights[i] = 0.;536} else {537// Otherwise, simply assign the computed energy.538Weights[i] = Inputs[i]->Energy;539}540541// If energy for all seeds is zero, fall back to vanilla schedule.542if (Weights[i] > 0.0)543VanillaSchedule = false;544}545}546547if (VanillaSchedule) {548for (size_t i = 0; i < N; i++)549Weights[i] =550Inputs[i]->NumFeatures551? static_cast<double>((i + 1) *552(Inputs[i]->HasFocusFunction ? 1000 : 1))553: 0.;554}555556if (FeatureDebug) {557for (size_t i = 0; i < N; i++)558Printf("%zd ", Inputs[i]->NumFeatures);559Printf("SCORE\n");560for (size_t i = 0; i < N; i++)561Printf("%f ", Weights[i]);562Printf("Weights\n");563}564CorpusDistribution = std::piecewise_constant_distribution<double>(565Intervals.begin(), Intervals.end(), Weights.begin());566}567std::piecewise_constant_distribution<double> CorpusDistribution;568569std::vector<double> Intervals;570std::vector<double> Weights;571572std::unordered_set<std::string> Hashes;573std::vector<InputInfo *> Inputs;574575size_t NumAddedFeatures = 0;576size_t NumUpdatedFeatures = 0;577uint32_t InputSizesPerFeature[kFeatureSetSize];578uint32_t SmallestElementPerFeature[kFeatureSetSize];579580bool DistributionNeedsUpdate = true;581uint16_t FreqOfMostAbundantRareFeature = 0;582uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {};583std::vector<uint32_t> RareFeatures;584std::bitset<kFeatureSetSize> IsRareFeature;585586std::string OutputCorpus;587};588589} // namespace fuzzer590591#endif // LLVM_FUZZER_CORPUS592593594