// Path: blob/main/contrib/llvm-project/compiler-rt/lib/fuzzer/FuzzerMerge.cpp
//===- FuzzerMerge.cpp - merging corpora ----------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7// Merging corpora.8//===----------------------------------------------------------------------===//910#include "FuzzerCommand.h"11#include "FuzzerMerge.h"12#include "FuzzerIO.h"13#include "FuzzerInternal.h"14#include "FuzzerTracePC.h"15#include "FuzzerUtil.h"1617#include <fstream>18#include <iterator>19#include <set>20#include <sstream>21#include <unordered_set>2223namespace fuzzer {2425bool Merger::Parse(const std::string &Str, bool ParseCoverage) {26std::istringstream SS(Str);27return Parse(SS, ParseCoverage);28}2930void Merger::ParseOrExit(std::istream &IS, bool ParseCoverage) {31if (!Parse(IS, ParseCoverage)) {32Printf("MERGE: failed to parse the control file (unexpected error)\n");33exit(1);34}35}3637// The control file example:38//39// 3 # The number of inputs40// 1 # The number of inputs in the first corpus, <= the previous number41// file042// file143// file2 # One file name per line.44// STARTED 0 123 # FileID, file size45// FT 0 1 4 6 8 # FileID COV1 COV2 ...46// COV 0 7 8 9 # FileID COV1 COV147// STARTED 1 456 # If FT is missing, the input crashed while processing.48// STARTED 2 56749// FT 2 8 950// COV 2 11 1251bool Merger::Parse(std::istream &IS, bool ParseCoverage) {52LastFailure.clear();53std::string Line;5455// Parse NumFiles.56if (!std::getline(IS, Line, '\n')) return false;57std::istringstream L1(Line);58size_t NumFiles = 0;59L1 >> NumFiles;60if (NumFiles == 0 || NumFiles > 10000000) return false;6162// Parse NumFilesInFirstCorpus.63if (!std::getline(IS, Line, '\n')) return false;64std::istringstream L2(Line);65NumFilesInFirstCorpus = NumFiles + 1;66L2 >> NumFilesInFirstCorpus;67if 
(NumFilesInFirstCorpus > NumFiles) return false;6869// Parse file names.70Files.resize(NumFiles);71for (size_t i = 0; i < NumFiles; i++)72if (!std::getline(IS, Files[i].Name, '\n'))73return false;7475// Parse STARTED, FT, and COV lines.76size_t ExpectedStartMarker = 0;77const size_t kInvalidStartMarker = -1;78size_t LastSeenStartMarker = kInvalidStartMarker;79bool HaveFtMarker = true;80std::vector<uint32_t> TmpFeatures;81std::set<uint32_t> PCs;82while (std::getline(IS, Line, '\n')) {83std::istringstream ISS1(Line);84std::string Marker;85uint32_t N;86if (!(ISS1 >> Marker) || !(ISS1 >> N))87return false;88if (Marker == "STARTED") {89// STARTED FILE_ID FILE_SIZE90if (ExpectedStartMarker != N)91return false;92ISS1 >> Files[ExpectedStartMarker].Size;93LastSeenStartMarker = ExpectedStartMarker;94assert(ExpectedStartMarker < Files.size());95ExpectedStartMarker++;96HaveFtMarker = false;97} else if (Marker == "FT") {98// FT FILE_ID COV1 COV2 COV3 ...99size_t CurrentFileIdx = N;100if (CurrentFileIdx != LastSeenStartMarker)101return false;102HaveFtMarker = true;103if (ParseCoverage) {104TmpFeatures.clear(); // use a vector from outer scope to avoid resizes.105while (ISS1 >> N)106TmpFeatures.push_back(N);107std::sort(TmpFeatures.begin(), TmpFeatures.end());108Files[CurrentFileIdx].Features = TmpFeatures;109}110} else if (Marker == "COV") {111size_t CurrentFileIdx = N;112if (CurrentFileIdx != LastSeenStartMarker)113return false;114if (ParseCoverage)115while (ISS1 >> N)116if (PCs.insert(N).second)117Files[CurrentFileIdx].Cov.push_back(N);118} else {119return false;120}121}122if (!HaveFtMarker && LastSeenStartMarker != kInvalidStartMarker)123LastFailure = Files[LastSeenStartMarker].Name;124125FirstNotProcessedFile = ExpectedStartMarker;126return true;127}128129size_t Merger::ApproximateMemoryConsumption() const {130size_t Res = 0;131for (const auto &F: Files)132Res += sizeof(F) + F.Features.size() * sizeof(F.Features[0]);133return Res;134}135136// Decides which files need to be 
merged (add those to NewFiles).137// Returns the number of new features added.138size_t Merger::Merge(const std::set<uint32_t> &InitialFeatures,139std::set<uint32_t> *NewFeatures,140const std::set<uint32_t> &InitialCov,141std::set<uint32_t> *NewCov,142std::vector<std::string> *NewFiles) {143NewFiles->clear();144NewFeatures->clear();145NewCov->clear();146assert(NumFilesInFirstCorpus <= Files.size());147std::set<uint32_t> AllFeatures = InitialFeatures;148149// What features are in the initial corpus?150for (size_t i = 0; i < NumFilesInFirstCorpus; i++) {151auto &Cur = Files[i].Features;152AllFeatures.insert(Cur.begin(), Cur.end());153}154// Remove all features that we already know from all other inputs.155for (size_t i = NumFilesInFirstCorpus; i < Files.size(); i++) {156auto &Cur = Files[i].Features;157std::vector<uint32_t> Tmp;158std::set_difference(Cur.begin(), Cur.end(), AllFeatures.begin(),159AllFeatures.end(), std::inserter(Tmp, Tmp.begin()));160Cur.swap(Tmp);161}162163// Sort. Give preference to164// * smaller files165// * files with more features.166std::sort(Files.begin() + NumFilesInFirstCorpus, Files.end(),167[&](const MergeFileInfo &a, const MergeFileInfo &b) -> bool {168if (a.Size != b.Size)169return a.Size < b.Size;170return a.Features.size() > b.Features.size();171});172173// One greedy pass: add the file's features to AllFeatures.174// If new features were added, add this file to NewFiles.175for (size_t i = NumFilesInFirstCorpus; i < Files.size(); i++) {176auto &Cur = Files[i].Features;177// Printf("%s -> sz %zd ft %zd\n", Files[i].Name.c_str(),178// Files[i].Size, Cur.size());179bool FoundNewFeatures = false;180for (auto Fe: Cur) {181if (AllFeatures.insert(Fe).second) {182FoundNewFeatures = true;183NewFeatures->insert(Fe);184}185}186if (FoundNewFeatures)187NewFiles->push_back(Files[i].Name);188for (auto Cov : Files[i].Cov)189if (InitialCov.find(Cov) == InitialCov.end())190NewCov->insert(Cov);191}192return 
NewFeatures->size();193}194195std::set<uint32_t> Merger::AllFeatures() const {196std::set<uint32_t> S;197for (auto &File : Files)198S.insert(File.Features.begin(), File.Features.end());199return S;200}201202// Inner process. May crash if the target crashes.203void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath,204bool IsSetCoverMerge) {205Printf("MERGE-INNER: using the control file '%s'\n", CFPath.c_str());206Merger M;207std::ifstream IF(CFPath);208M.ParseOrExit(IF, false);209IF.close();210if (!M.LastFailure.empty())211Printf("MERGE-INNER: '%s' caused a failure at the previous merge step\n",212M.LastFailure.c_str());213214Printf("MERGE-INNER: %zd total files;"215" %zd processed earlier; will process %zd files now\n",216M.Files.size(), M.FirstNotProcessedFile,217M.Files.size() - M.FirstNotProcessedFile);218219std::ofstream OF(CFPath, std::ofstream::out | std::ofstream::app);220std::set<size_t> AllFeatures;221auto PrintStatsWrapper = [this, &AllFeatures](const char* Where) {222this->PrintStats(Where, "\n", 0, AllFeatures.size());223};224std::set<const TracePC::PCTableEntry *> AllPCs;225for (size_t i = M.FirstNotProcessedFile; i < M.Files.size(); i++) {226Fuzzer::MaybeExitGracefully();227auto U = FileToVector(M.Files[i].Name);228if (U.size() > MaxInputLen) {229U.resize(MaxInputLen);230U.shrink_to_fit();231}232233// Write the pre-run marker.234OF << "STARTED " << i << " " << U.size() << "\n";235OF.flush(); // Flush is important since Command::Execute may crash.236// Run.237TPC.ResetMaps();238ExecuteCallback(U.data(), U.size());239// Collect coverage. 
We are iterating over the files in this order:240// * First, files in the initial corpus ordered by size, smallest first.241// * Then, all other files, smallest first.242std::set<size_t> Features;243if (IsSetCoverMerge)244TPC.CollectFeatures([&](size_t Feature) { Features.insert(Feature); });245else246TPC.CollectFeatures([&](size_t Feature) {247if (AllFeatures.insert(Feature).second)248Features.insert(Feature);249});250TPC.UpdateObservedPCs();251// Show stats.252if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1)))253PrintStatsWrapper("pulse ");254if (TotalNumberOfRuns == M.NumFilesInFirstCorpus)255PrintStatsWrapper("LOADED");256// Write the post-run marker and the coverage.257OF << "FT " << i;258for (size_t F : Features)259OF << " " << F;260OF << "\n";261OF << "COV " << i;262TPC.ForEachObservedPC([&](const TracePC::PCTableEntry *TE) {263if (AllPCs.insert(TE).second)264OF << " " << TPC.PCTableEntryIdx(TE);265});266OF << "\n";267OF.flush();268}269PrintStatsWrapper("DONE ");270}271272// Merges all corpora into the first corpus. A file is added into273// the first corpus only if it adds new features. Unlike `Merger::Merge`,274// this implementation calculates an approximation of the minimum set275// of corpora files, that cover all known features (set cover problem).276// Generally, this means that files with more features are preferred for277// merge into the first corpus. 
When two files have the same number of278// features, the smaller one is preferred.279size_t Merger::SetCoverMerge(const std::set<uint32_t> &InitialFeatures,280std::set<uint32_t> *NewFeatures,281const std::set<uint32_t> &InitialCov,282std::set<uint32_t> *NewCov,283std::vector<std::string> *NewFiles) {284assert(NumFilesInFirstCorpus <= Files.size());285NewFiles->clear();286NewFeatures->clear();287NewCov->clear();288std::set<uint32_t> AllFeatures;289// 1 << 21 - 1 is the maximum feature index.290// See 'kFeatureSetSize' in 'FuzzerCorpus.h'.291const uint32_t kFeatureSetSize = 1 << 21;292std::vector<bool> Covered(kFeatureSetSize, false);293size_t NumCovered = 0;294295std::set<uint32_t> ExistingFeatures = InitialFeatures;296for (size_t i = 0; i < NumFilesInFirstCorpus; ++i)297ExistingFeatures.insert(Files[i].Features.begin(), Files[i].Features.end());298299// Mark the existing features as covered.300for (const auto &F : ExistingFeatures) {301if (!Covered[F % kFeatureSetSize]) {302++NumCovered;303Covered[F % kFeatureSetSize] = true;304}305// Calculate an underestimation of the set of covered features306// since the `Covered` bitvector is smaller than the feature range.307AllFeatures.insert(F % kFeatureSetSize);308}309310std::set<size_t> RemainingFiles;311for (size_t i = NumFilesInFirstCorpus; i < Files.size(); ++i) {312// Construct an incremental sequence which represent the313// indices to all files (excluding those in the initial corpus).314// RemainingFiles = range(NumFilesInFirstCorpus..Files.size()).315RemainingFiles.insert(i);316// Insert this file's unique features to all features.317for (const auto &F : Files[i].Features)318AllFeatures.insert(F % kFeatureSetSize);319}320321// Integrate files into Covered until set is complete.322while (NumCovered != AllFeatures.size()) {323// Index to file with largest number of unique features.324size_t MaxFeaturesIndex = NumFilesInFirstCorpus;325// Indices to remove from RemainingFiles.326std::set<size_t> RemoveIndices;327// 
Running max unique feature count.328// Updated upon finding a file with more features.329size_t MaxNumFeatures = 0;330331// Iterate over all files not yet integrated into Covered,332// to find the file which has the largest number of333// features that are not already in Covered.334for (const auto &i : RemainingFiles) {335const auto &File = Files[i];336size_t CurrentUnique = 0;337// Count number of features in this file338// which are not yet in Covered.339for (const auto &F : File.Features)340if (!Covered[F % kFeatureSetSize])341++CurrentUnique;342343if (CurrentUnique == 0) {344// All features in this file are already in Covered: skip next time.345RemoveIndices.insert(i);346} else if (CurrentUnique > MaxNumFeatures ||347(CurrentUnique == MaxNumFeatures &&348File.Size < Files[MaxFeaturesIndex].Size)) {349// Update the max features file based on unique features350// Break ties by selecting smaller files.351MaxNumFeatures = CurrentUnique;352MaxFeaturesIndex = i;353}354}355// Must be a valid index/356assert(MaxFeaturesIndex < Files.size());357// Remove any feature-less files found.358for (const auto &i : RemoveIndices)359RemainingFiles.erase(i);360if (MaxNumFeatures == 0) {361// Did not find a file that adds unique features.362// This means that we should have no remaining files.363assert(RemainingFiles.size() == 0);364assert(NumCovered == AllFeatures.size());365break;366}367368// MaxFeaturesIndex must be an element of Remaining.369assert(RemainingFiles.find(MaxFeaturesIndex) != RemainingFiles.end());370// Remove the file with the most features from Remaining.371RemainingFiles.erase(MaxFeaturesIndex);372const auto &MaxFeatureFile = Files[MaxFeaturesIndex];373// Add the features of the max feature file to Covered.374for (const auto &F : MaxFeatureFile.Features) {375if (!Covered[F % kFeatureSetSize]) {376++NumCovered;377Covered[F % kFeatureSetSize] = true;378NewFeatures->insert(F);379}380}381// Add the index to this file to the 
result.382NewFiles->push_back(MaxFeatureFile.Name);383// Update NewCov with the additional coverage384// that MaxFeatureFile provides.385for (const auto &C : MaxFeatureFile.Cov)386if (InitialCov.find(C) == InitialCov.end())387NewCov->insert(C);388}389390return NewFeatures->size();391}392393static size_t394WriteNewControlFile(const std::string &CFPath,395const std::vector<SizedFile> &OldCorpus,396const std::vector<SizedFile> &NewCorpus,397const std::vector<MergeFileInfo> &KnownFiles) {398std::unordered_set<std::string> FilesToSkip;399for (auto &SF: KnownFiles)400FilesToSkip.insert(SF.Name);401402std::vector<std::string> FilesToUse;403auto MaybeUseFile = [=, &FilesToUse](std::string Name) {404if (FilesToSkip.find(Name) == FilesToSkip.end())405FilesToUse.push_back(Name);406};407for (auto &SF: OldCorpus)408MaybeUseFile(SF.File);409auto FilesToUseFromOldCorpus = FilesToUse.size();410for (auto &SF: NewCorpus)411MaybeUseFile(SF.File);412413RemoveFile(CFPath);414std::ofstream ControlFile(CFPath);415ControlFile << FilesToUse.size() << "\n";416ControlFile << FilesToUseFromOldCorpus << "\n";417for (auto &FN: FilesToUse)418ControlFile << FN << "\n";419420if (!ControlFile) {421Printf("MERGE-OUTER: failed to write to the control file: %s\n",422CFPath.c_str());423exit(1);424}425426return FilesToUse.size();427}428429// Outer process. 
Does not call the target code and thus should not fail.430void CrashResistantMerge(const std::vector<std::string> &Args,431const std::vector<SizedFile> &OldCorpus,432const std::vector<SizedFile> &NewCorpus,433std::vector<std::string> *NewFiles,434const std::set<uint32_t> &InitialFeatures,435std::set<uint32_t> *NewFeatures,436const std::set<uint32_t> &InitialCov,437std::set<uint32_t> *NewCov, const std::string &CFPath,438bool V, /*Verbose*/439bool IsSetCoverMerge) {440if (NewCorpus.empty() && OldCorpus.empty()) return; // Nothing to merge.441size_t NumAttempts = 0;442std::vector<MergeFileInfo> KnownFiles;443if (FileSize(CFPath)) {444VPrintf(V, "MERGE-OUTER: non-empty control file provided: '%s'\n",445CFPath.c_str());446Merger M;447std::ifstream IF(CFPath);448if (M.Parse(IF, /*ParseCoverage=*/true)) {449VPrintf(V, "MERGE-OUTER: control file ok, %zd files total,"450" first not processed file %zd\n",451M.Files.size(), M.FirstNotProcessedFile);452if (!M.LastFailure.empty())453VPrintf(V, "MERGE-OUTER: '%s' will be skipped as unlucky "454"(merge has stumbled on it the last time)\n",455M.LastFailure.c_str());456if (M.FirstNotProcessedFile >= M.Files.size()) {457// Merge has already been completed with the given merge control file.458if (M.Files.size() == OldCorpus.size() + NewCorpus.size()) {459VPrintf(460V,461"MERGE-OUTER: nothing to do, merge has been completed before\n");462exit(0);463}464465// Number of input files likely changed, start merge from scratch, but466// reuse coverage information from the given merge control file.467VPrintf(468V,469"MERGE-OUTER: starting merge from scratch, but reusing coverage "470"information from the given control file\n");471KnownFiles = M.Files;472} else {473// There is a merge in progress, continue.474NumAttempts = M.Files.size() - M.FirstNotProcessedFile;475}476} else {477VPrintf(V, "MERGE-OUTER: bad control file, will overwrite it\n");478}479}480481if (!NumAttempts) {482// The supplied control file is empty or bad, create a fresh 
one.483VPrintf(V, "MERGE-OUTER: "484"%zd files, %zd in the initial corpus, %zd processed earlier\n",485OldCorpus.size() + NewCorpus.size(), OldCorpus.size(),486KnownFiles.size());487NumAttempts = WriteNewControlFile(CFPath, OldCorpus, NewCorpus, KnownFiles);488}489490// Execute the inner process until it passes.491// Every inner process should execute at least one input.492Command BaseCmd(Args);493BaseCmd.removeFlag("merge");494BaseCmd.removeFlag("set_cover_merge");495BaseCmd.removeFlag("fork");496BaseCmd.removeFlag("collect_data_flow");497for (size_t Attempt = 1; Attempt <= NumAttempts; Attempt++) {498Fuzzer::MaybeExitGracefully();499VPrintf(V, "MERGE-OUTER: attempt %zd\n", Attempt);500Command Cmd(BaseCmd);501Cmd.addFlag("merge_control_file", CFPath);502// If we are going to use the set cover implementation for503// minimization add the merge_inner=2 internal flag.504Cmd.addFlag("merge_inner", IsSetCoverMerge ? "2" : "1");505if (!V) {506Cmd.setOutputFile(getDevNull());507Cmd.combineOutAndErr();508}509auto ExitCode = ExecuteCommand(Cmd);510if (!ExitCode) {511VPrintf(V, "MERGE-OUTER: successful in %zd attempt(s)\n", Attempt);512break;513}514}515// Read the control file and do the merge.516Merger M;517std::ifstream IF(CFPath);518IF.seekg(0, IF.end);519VPrintf(V, "MERGE-OUTER: the control file has %zd bytes\n",520(size_t)IF.tellg());521IF.seekg(0, IF.beg);522M.ParseOrExit(IF, true);523IF.close();524VPrintf(V,525"MERGE-OUTER: consumed %zdMb (%zdMb rss) to parse the control file\n",526M.ApproximateMemoryConsumption() >> 20, GetPeakRSSMb());527528M.Files.insert(M.Files.end(), KnownFiles.begin(), KnownFiles.end());529if (IsSetCoverMerge)530M.SetCoverMerge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);531else532M.Merge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);533VPrintf(V, "MERGE-OUTER: %zd new files with %zd new features added; "534"%zd new coverage edges\n",535NewFiles->size(), NewFeatures->size(), NewCov->size());536}537538} // namespace 
fuzzer539540541