diff options
author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h | |
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h')
-rw-r--r-- | contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h | 1600 |
1 files changed, 800 insertions, 800 deletions
diff --git a/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h b/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h index 6c765c5c00..4544bfc8b9 100644 --- a/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -1,800 +1,800 @@ -#pragma once - -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#endif - -//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// Interface file for the IRSimilarityIdentifier for identifying similarities in -// IR including the IRInstructionMapper, which maps an Instruction to unsigned -// integers. -// -// Two sequences of instructions are called "similar" if they perform the same -// series of operations for all inputs. -// -// \code -// %1 = add i32 %a, 10 -// %2 = add i32 %a, %1 -// %3 = icmp slt icmp %1, %2 -// \endcode -// -// and -// -// \code -// %1 = add i32 11, %a -// %2 = sub i32 %a, %1 -// %3 = icmp sgt icmp %2, %1 -// \endcode -// -// ultimately have the same result, even if the inputs, and structure are -// slightly different. -// -// For instructions, we do not worry about operands that do not have fixed -// semantic meaning to the program. We consider the opcode that the instruction -// has, the types, parameters, and extra information such as the function name, -// or comparison predicate. These are used to create a hash to map instructions -// to integers to be used in similarity matching in sequences of instructions -// -// Terminology: -// An IRSimilarityCandidate is a region of IRInstructionData (wrapped -// Instructions), usually used to denote a region of similarity has been found. -// -// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally -// similar to one another. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H -#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H - -#include "llvm/IR/InstVisitor.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" -#include "llvm/Support/Allocator.h" - -namespace llvm { -namespace IRSimilarity { - -struct IRInstructionDataList; - -/// This represents what is and is not supported when finding similarity in -/// Instructions. -/// -/// Legal Instructions are considered when looking at similarity between -/// Instructions. -/// -/// Illegal Instructions cannot be considered when looking for similarity -/// between Instructions. They act as boundaries between similarity regions. -/// -/// Invisible Instructions are skipped over during analysis. -// TODO: Shared with MachineOutliner -enum InstrType { Legal, Illegal, Invisible }; - -/// This provides the utilities for hashing an Instruction to an unsigned -/// integer. Two IRInstructionDatas produce the same hash value when their -/// underlying Instructions perform the same operation (even if they don't have -/// the same input operands.) -/// As a more concrete example, consider the following: -/// -/// \code -/// %add1 = add i32 %a, %b -/// %add2 = add i32 %c, %d -/// %add3 = add i64 %e, %f -/// \endcode -/// -// Then the IRInstructionData wrappers for these Instructions may be hashed like -/// so: -/// -/// \code -/// ; These two adds have the same types and operand types, so they hash to the -/// ; same number. -/// %add1 = add i32 %a, %b ; Hash: 1 -/// %add2 = add i32 %c, %d ; Hash: 1 -/// ; This add produces an i64. This differentiates it from %add1 and %add2. So, -/// ; it hashes to a different number. -/// %add3 = add i64 %e, %f; Hash: 2 -/// \endcode -/// -/// -/// This hashing scheme will be used to represent the program as a very long -/// string. This string can then be placed in a data structure which can be used -/// for similarity queries. -/// -/// TODO: Handle types of Instructions which can be equal even with different -/// operands. (E.g. comparisons with swapped predicates.) -/// TODO: Handle CallInsts, which are only checked for function type -/// by \ref isSameOperationAs. -/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the -/// exact same, and some do not. -struct IRInstructionData : ilist_node<IRInstructionData> { - - /// The source Instruction that is being wrapped. - Instruction *Inst = nullptr; - /// The values of the operands in the Instruction. - SmallVector<Value *, 4> OperVals; - /// The legality of the wrapped instruction. This is informed by InstrType, - /// and is used when checking when two instructions are considered similar. - /// If either instruction is not legal, the instructions are automatically not - /// considered similar. - bool Legal; - - /// This is only relevant if we are wrapping a CmpInst where we needed to - /// change the predicate of a compare instruction from a greater than form - /// to a less than form. It is None otherwise. - Optional<CmpInst::Predicate> RevisedPredicate; - - /// Gather the information that is difficult to gather for an Instruction, or - /// is changed. i.e. the operands of an Instruction and the Types of those - /// operands. This extra information allows for similarity matching to make - /// assertions that allow for more flexibility when checking for whether an - /// Instruction performs the same operation. - IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); - - /// Get the predicate that the compare instruction is using for hashing the - /// instruction. the IRInstructionData must be wrapping a CmpInst. - CmpInst::Predicate getPredicate() const; - - /// A function that swaps the predicates to their less than form if they are - /// in a greater than form. Otherwise, the predicate is unchanged. - /// - /// \param CI - The comparison operation to find a consistent preidcate for. - /// \return the consistent comparison predicate. - static CmpInst::Predicate predicateForConsistency(CmpInst *CI); - - /// Hashes \p Value based on its opcode, types, and operand types. - /// Two IRInstructionData instances produce the same hash when they perform - /// the same operation. - /// - /// As a simple example, consider the following instructions. - /// - /// \code - /// %add1 = add i32 %x1, %y1 - /// %add2 = add i32 %x2, %y2 - /// - /// %sub = sub i32 %x1, %y1 - /// - /// %add_i64 = add i64 %x2, %y2 - /// \endcode - /// - /// Because the first two adds operate the same types, and are performing the - /// same action, they will be hashed to the same value. - /// - /// However, the subtraction instruction is not the same as an addition, and - /// will be hashed to a different value. - /// - /// Finally, the last add has a different type compared to the first two add - /// instructions, so it will also be hashed to a different value that any of - /// the previous instructions. - /// - /// \param [in] ID - The IRInstructionData instance to be hashed. - /// \returns A hash_value of the IRInstructionData. - friend hash_code hash_value(const IRInstructionData &ID) { - SmallVector<Type *, 4> OperTypes; - for (Value *V : ID.OperVals) - OperTypes.push_back(V->getType()); - - if (isa<CmpInst>(ID.Inst)) - return llvm::hash_combine( - llvm::hash_value(ID.Inst->getOpcode()), - llvm::hash_value(ID.Inst->getType()), - llvm::hash_value(ID.getPredicate()), - llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); - else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst)) - return llvm::hash_combine( - llvm::hash_value(ID.Inst->getOpcode()), - llvm::hash_value(ID.Inst->getType()), - llvm::hash_value(CI->getCalledFunction()->getName().str()), - llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); - return llvm::hash_combine( - llvm::hash_value(ID.Inst->getOpcode()), - llvm::hash_value(ID.Inst->getType()), - llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); - } - - IRInstructionDataList *IDL = nullptr; -}; - -struct IRInstructionDataList : simple_ilist<IRInstructionData> {}; - -/// Compare one IRInstructionData class to another IRInstructionData class for -/// whether they are performing a the same operation, and can mapped to the -/// same value. For regular instructions if the hash value is the same, then -/// they will also be close. -/// -/// \param A - The first IRInstructionData class to compare -/// \param B - The second IRInstructionData class to compare -/// \returns true if \p A and \p B are similar enough to be mapped to the same -/// value. -bool isClose(const IRInstructionData &A, const IRInstructionData &B); - -struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { - static inline IRInstructionData *getEmptyKey() { return nullptr; } - static inline IRInstructionData *getTombstoneKey() { - return reinterpret_cast<IRInstructionData *>(-1); - } - - static unsigned getHashValue(const IRInstructionData *E) { - using llvm::hash_value; - assert(E && "IRInstructionData is a nullptr?"); - return hash_value(*E); - } - - static bool isEqual(const IRInstructionData *LHS, - const IRInstructionData *RHS) { - if (RHS == getEmptyKey() || RHS == getTombstoneKey() || - LHS == getEmptyKey() || LHS == getTombstoneKey()) - return LHS == RHS; - - assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); - return isClose(*LHS, *RHS); - } -}; - -/// Helper struct for converting the Instructions in a Module into a vector of -/// unsigned integers. This vector of unsigned integers can be thought of as a -/// "numeric string". This numeric string can then be queried by, for example, -/// data structures that find repeated substrings. -/// -/// This hashing is done per BasicBlock in the module. To hash Instructions -/// based off of their operations, each Instruction is wrapped in an -/// IRInstructionData struct. The unsigned integer for an IRInstructionData -/// depends on: -/// - The hash provided by the IRInstructionData. -/// - Which member of InstrType the IRInstructionData is classified as. -// See InstrType for more details on the possible classifications, and how they -// manifest in the numeric string. -/// -/// The numeric string for an individual BasicBlock is terminated by an unique -/// unsigned integer. This prevents data structures which rely on repetition -/// from matching across BasicBlocks. (For example, the SuffixTree.) -/// As a concrete example, if we have the following two BasicBlocks: -/// \code -/// bb0: -/// %add1 = add i32 %a, %b -/// %add2 = add i32 %c, %d -/// %add3 = add i64 %e, %f -/// bb1: -/// %sub = sub i32 %c, %d -/// \endcode -/// We may hash the Instructions like this (via IRInstructionData): -/// \code -/// bb0: -/// %add1 = add i32 %a, %b ; Hash: 1 -/// %add2 = add i32 %c, %d; Hash: 1 -/// %add3 = add i64 %e, %f; Hash: 2 -/// bb1: -/// %sub = sub i32 %c, %d; Hash: 3 -/// %add4 = add i32 %c, %d ; Hash: 1 -/// \endcode -/// And produce a "numeric string representation" like so: -/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 -/// -/// TODO: This is very similar to the MachineOutliner, and should be -/// consolidated into the same interface. -struct IRInstructionMapper { - /// The starting illegal instruction number to map to. - /// - /// Set to -3 for compatibility with DenseMapInfo<unsigned>. - unsigned IllegalInstrNumber = static_cast<unsigned>(-3); - - /// The next available integer to assign to a legal Instruction to. - unsigned LegalInstrNumber = 0; - - /// Correspondence from IRInstructionData to unsigned integers. - DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> - InstructionIntegerMap; - - /// Set if we added an illegal number in the previous step. - /// Since each illegal number is unique, we only need one of them between - /// each range of legal numbers. This lets us make sure we don't add more - /// than one illegal number per range. - bool AddedIllegalLastTime = false; - - /// Marks whether we found a illegal instruction in the previous step. - bool CanCombineWithPrevInstr = false; - - /// Marks whether we have found a set of instructions that is long enough - /// to be considered for similarity. - bool HaveLegalRange = false; - - /// This allocator pointer is in charge of holding on to the IRInstructionData - /// so it is not deallocated until whatever external tool is using it is done - /// with the information. - SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; - - /// This allocator pointer is in charge of creating the IRInstructionDataList - /// so it is not deallocated until whatever external tool is using it is done - /// with the information. - SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr; - - /// Get an allocated IRInstructionData struct using the InstDataAllocator. - /// - /// \param I - The Instruction to wrap with IRInstructionData. - /// \param Legality - A boolean value that is true if the instruction is to - /// be considered for similarity, and false if not. - /// \param IDL - The InstructionDataList that the IRInstructionData is - /// inserted into. - /// \returns An allocated IRInstructionData struct. - IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, - IRInstructionDataList &IDL); - - /// Get an allocated IRInstructionDataList object using the IDLAllocator. - /// - /// \returns An allocated IRInstructionDataList object. - IRInstructionDataList *allocateIRInstructionDataList(); - - IRInstructionDataList *IDL = nullptr; - - /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers - /// determined by \p InstrType. Two Instructions are mapped to the same value - /// if they are close as defined by the InstructionData class above. - /// - /// \param [in] BB - The BasicBlock to be mapped to integers. - /// \param [in,out] InstrList - Vector of IRInstructionData to append to. - /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. - void convertToUnsignedVec(BasicBlock &BB, - std::vector<IRInstructionData *> &InstrList, - std::vector<unsigned> &IntegerMapping); - - /// Maps an Instruction to a legal integer. - /// - /// \param [in] It - The Instruction to be mapped to an integer. - /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to - /// append to. - /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. - /// \returns The integer \p It was mapped to. - unsigned mapToLegalUnsigned(BasicBlock::iterator &It, - std::vector<unsigned> &IntegerMappingForBB, - std::vector<IRInstructionData *> &InstrListForBB); - - /// Maps an Instruction to an illegal integer. - /// - /// \param [in] It - The \p Instruction to be mapped to an integer. - /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to - /// append to. - /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. - /// \param End - true if creating a dummy IRInstructionData at the end of a - /// basic block. - /// \returns The integer \p It was mapped to. - unsigned mapToIllegalUnsigned( - BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, - std::vector<IRInstructionData *> &InstrListForBB, bool End = false); - - IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA, - SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA) - : InstDataAllocator(IDA), IDLAllocator(IDLA) { - // Make sure that the implementation of DenseMapInfo<unsigned> hasn't - // changed. - assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == - static_cast<unsigned>(-2) && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); - - IDL = new (IDLAllocator->Allocate()) - IRInstructionDataList(); - } - - /// Custom InstVisitor to classify different instructions for whether it can - /// be analyzed for similarity. - struct InstructionClassification - : public InstVisitor<InstructionClassification, InstrType> { - InstructionClassification() {} - - // TODO: Determine a scheme to resolve when the label is similar enough. - InstrType visitBranchInst(BranchInst &BI) { return Illegal; } - // TODO: Determine a scheme to resolve when the labels are similar enough. - InstrType visitPHINode(PHINode &PN) { return Illegal; } - // TODO: Handle allocas. - InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } - // We exclude variable argument instructions since variable arguments - // requires extra checking of the argument list. - InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } - // We exclude all exception handling cases since they are so context - // dependent. - InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } - InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } - // DebugInfo should be included in the regions, but should not be - // analyzed for similarity as it has no bearing on the outcome of the - // program. - InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } - // TODO: Handle specific intrinsics. - InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } - // We only allow call instructions where the function has a name and - // is not an indirect call. - InstrType visitCallInst(CallInst &CI) { - Function *F = CI.getCalledFunction(); - if (!F || CI.isIndirectCall() || !F->hasName()) - return Illegal; - return Legal; - } - // TODO: We do not current handle similarity that changes the control flow. - InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } - // TODO: We do not current handle similarity that changes the control flow. - InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } - // TODO: Handle interblock similarity. - InstrType visitTerminator(Instruction &I) { return Illegal; } - InstrType visitInstruction(Instruction &I) { return Legal; } - }; - - /// Maps an Instruction to a member of InstrType. - InstructionClassification InstClassifier; -}; - -/// This is a class that wraps a range of IRInstructionData from one point to -/// another in the vector of IRInstructionData, which is a region of the -/// program. It is also responsible for defining the structure within this -/// region of instructions. -/// -/// The structure of a region is defined through a value numbering system -/// assigned to each unique value in a region at the creation of the -/// IRSimilarityCandidate. -/// -/// For example, for each Instruction we add a mapping for each new -/// value seen in that Instruction. -/// IR: Mapping Added: -/// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 -/// %add2 = add i32 %a, %1 %add2 -> 4 -/// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 -/// -/// We can compare IRSimilarityCandidates against one another. -/// The \ref isSimilar function compares each IRInstructionData against one -/// another and if we have the same sequences of IRInstructionData that would -/// create the same hash, we have similar IRSimilarityCandidates. -/// -/// We can also compare the structure of IRSimilarityCandidates. If we can -/// create a mapping of registers in the region contained by one -/// IRSimilarityCandidate to the region contained by different -/// IRSimilarityCandidate, they can be considered structurally similar. -/// -/// IRSimilarityCandidate1: IRSimilarityCandidate2: -/// %add1 = add i32 %a, %b %add1 = add i32 %d, %e -/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f -/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 -/// -/// Can have the following mapping from candidate to candidate of: -/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 -/// and can be considered similar. -/// -/// IRSimilarityCandidate1: IRSimilarityCandidate2: -/// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 -/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f -/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 -/// -/// We cannot create the same mapping since the use of c4 is not used in the -/// same way as %b or c2. -class IRSimilarityCandidate { -private: - /// The start index of this IRSimilarityCandidate in the instruction list. - unsigned StartIdx = 0; - - /// The number of instructions in this IRSimilarityCandidate. - unsigned Len = 0; - - /// The first instruction in this IRSimilarityCandidate. - IRInstructionData *FirstInst = nullptr; - - /// The last instruction in this IRSimilarityCandidate. - IRInstructionData *LastInst = nullptr; - - /// Global Value Numbering structures - /// @{ - /// Stores the mapping of the value to the number assigned to it in the - /// IRSimilarityCandidate. - DenseMap<Value *, unsigned> ValueToNumber; - /// Stores the mapping of the number to the value assigned this number. - DenseMap<unsigned, Value *> NumberToValue; - /// @} - -public: - /// \param StartIdx - The starting location of the region. - /// \param Len - The length of the region. - /// \param FirstInstIt - The starting IRInstructionData of the region. - /// \param LastInstIt - The ending IRInstructionData of the region. - IRSimilarityCandidate(unsigned StartIdx, unsigned Len, - IRInstructionData *FirstInstIt, - IRInstructionData *LastInstIt); - - /// \param A - The first IRInstructionCandidate to compare. - /// \param B - The second IRInstructionCandidate to compare. - /// \returns True when every IRInstructionData in \p A is similar to every - /// IRInstructionData in \p B. - static bool isSimilar(const IRSimilarityCandidate &A, - const IRSimilarityCandidate &B); - - /// \param A - The first IRInstructionCandidate to compare. - /// \param B - The second IRInstructionCandidate to compare. - /// \returns True when every IRInstructionData in \p A is structurally similar - /// to \p B. - static bool compareStructure(const IRSimilarityCandidate &A, - const IRSimilarityCandidate &B); - - struct OperandMapping { - /// The IRSimilarityCandidate that holds the instruction the OperVals were - /// pulled from. - const IRSimilarityCandidate &IRSC; - - /// The operand values to be analyzed. - ArrayRef<Value *> &OperVals; - - /// The current mapping of global value numbers from one IRSimilarityCandidate - /// to another IRSimilarityCandidate. - DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping; - }; - - /// Compare the operands in \p A and \p B and check that the current mapping - /// of global value numbers from \p A to \p B and \p B to \A is consistent. - /// - /// \param A - The first IRInstructionCandidate, operand values, and current - /// operand mappings to compare. - /// \param B - The second IRInstructionCandidate, operand values, and current - /// operand mappings to compare. - /// \returns true if the IRSimilarityCandidates operands are compatible. - static bool compareNonCommutativeOperandMapping(OperandMapping A, - OperandMapping B); - - /// Compare the operands in \p A and \p B and check that the current mapping - /// of global value numbers from \p A to \p B and \p B to \A is consistent - /// given that the operands are commutative. - /// - /// \param A - The first IRInstructionCandidate, operand values, and current - /// operand mappings to compare. - /// \param B - The second IRInstructionCandidate, operand values, and current - /// operand mappings to compare. - /// \returns true if the IRSimilarityCandidates operands are compatible. - static bool compareCommutativeOperandMapping(OperandMapping A, - OperandMapping B); - - /// Compare the start and end indices of the two IRSimilarityCandidates for - /// whether they overlap. If the start instruction of one - /// IRSimilarityCandidate is less than the end instruction of the other, and - /// the start instruction of one is greater than the start instruction of the - /// other, they overlap. - /// - /// \returns true if the IRSimilarityCandidates do not have overlapping - /// instructions. - static bool overlap(const IRSimilarityCandidate &A, - const IRSimilarityCandidate &B); - - /// \returns the number of instructions in this Candidate. - unsigned getLength() const { return Len; } - - /// \returns the start index of this IRSimilarityCandidate. - unsigned getStartIdx() const { return StartIdx; } - - /// \returns the end index of this IRSimilarityCandidate. - unsigned getEndIdx() const { return StartIdx + Len - 1; } - - /// \returns The first IRInstructionData. - IRInstructionData *front() const { return FirstInst; } - /// \returns The last IRInstructionData. - IRInstructionData *back() const { return LastInst; } - - /// \returns The first Instruction. - Instruction *frontInstruction() { return FirstInst->Inst; } - /// \returns The last Instruction - Instruction *backInstruction() { return LastInst->Inst; } - - /// \returns The BasicBlock the IRSimilarityCandidate starts in. - BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } - /// \returns The BasicBlock the IRSimilarityCandidate ends in. - BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } - - /// \returns The Function that the IRSimilarityCandidate is located in. - Function *getFunction() { return getStartBB()->getParent(); } - - /// Finds the positive number associated with \p V if it has been mapped. - /// \param [in] V - the Value to find. - /// \returns The positive number corresponding to the value. - /// \returns None if not present. - Optional<unsigned> getGVN(Value *V) { - assert(V != nullptr && "Value is a nullptr?"); - DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V); - if (VNIt == ValueToNumber.end()) - return None; - return VNIt->second; - } - - /// Finds the Value associate with \p Num if it exists. - /// \param [in] Num - the number to find. - /// \returns The Value associated with the number. - /// \returns None if not present. - Optional<Value *> fromGVN(unsigned Num) { - DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num); - if (VNIt == NumberToValue.end()) - return None; - assert(VNIt->second != nullptr && "Found value is a nullptr!"); - return VNIt->second; - } - - /// \param RHS -The IRSimilarityCandidate to compare against - /// \returns true if the IRSimilarityCandidate is occurs after the - /// IRSimilarityCandidate in the program. - bool operator<(const IRSimilarityCandidate &RHS) const { - return getStartIdx() > RHS.getStartIdx(); - } - - using iterator = IRInstructionDataList::iterator; - iterator begin() const { return iterator(front()); } - iterator end() const { return std::next(iterator(back())); } -}; - -typedef std::vector<IRSimilarityCandidate> SimilarityGroup; -typedef std::vector<SimilarityGroup> SimilarityGroupList; - -/// This class puts all the pieces of the IRInstructionData, -/// IRInstructionMapper, IRSimilarityCandidate together. -/// -/// It first feeds the Module or vector of Modules into the IRInstructionMapper, -/// and puts all the mapped instructions into a single long list of -/// IRInstructionData. -/// -/// The list of unsigned integers is given to the Suffix Tree or similar data -/// structure to find repeated subsequences. We construct an -/// IRSimilarityCandidate for each instance of the subsequence. We compare them -/// against one another since These repeated subsequences can have different -/// structure. For each different kind of structure found, we create a -/// similarity group. -/// -/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are -/// structurally similar to one another, while C is different we would have two -/// SimilarityGroups: -/// -/// SimilarityGroup 1: SimilarityGroup 2 -/// A, B, D C -/// -/// A list of the different similarity groups is then returned after -/// analyzing the module. -class IRSimilarityIdentifier { -public: - IRSimilarityIdentifier() - : Mapper(&InstDataAllocator, &InstDataListAllocator) {} - - /// \param M the module to find similarity in. - explicit IRSimilarityIdentifier(Module &M) - : Mapper(&InstDataAllocator, &InstDataListAllocator) { - findSimilarity(M); - } - -private: - /// Map the instructions in the module to unsigned integers, using mapping - /// already present in the Mapper if possible. - /// - /// \param [in] M Module - To map to integers. - /// \param [in,out] InstrList - The vector to append IRInstructionData to. - /// \param [in,out] IntegerMapping - The vector to append integers to. - void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList, - std::vector<unsigned> &IntegerMapping); - - /// Map the instructions in the modules vector to unsigned integers, using - /// mapping already present in the mapper if possible. - /// - /// \param [in] Modules - The list of modules to use to populate the mapper - /// \param [in,out] InstrList - The vector to append IRInstructionData to. - /// \param [in,out] IntegerMapping - The vector to append integers to. - void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules, - std::vector<IRInstructionData *> &InstrList, - std::vector<unsigned> &IntegerMapping); - - /// Find the similarity candidates in \p InstrList and corresponding - /// \p UnsignedVec - /// - /// \param [in,out] InstrList - The vector to append IRInstructionData to. - /// \param [in,out] IntegerMapping - The vector to append integers to. - /// candidates found in the program. - void findCandidates(std::vector<IRInstructionData *> &InstrList, - std::vector<unsigned> &IntegerMapping); - -public: - // Find the IRSimilarityCandidates in the \p Modules and group by structural - // similarity in a SimilarityGroup, each group is returned in a - // SimilarityGroupList. - // - // \param [in] Modules - the modules to analyze. - // \returns The groups of similarity ranges found in the modules. - SimilarityGroupList & - findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules); - - // Find the IRSimilarityCandidates in the given Module grouped by structural - // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. - // - // \param [in] M - the module to analyze. - // \returns The groups of similarity ranges found in the module. - SimilarityGroupList &findSimilarity(Module &M); - - // Clears \ref SimilarityCandidates if it is already filled by a previous run. - void resetSimilarityCandidates() { - // If we've already analyzed a Module or set of Modules, so we must clear - // the SimilarityCandidates to make sure we do not have only old values - // hanging around. - if (SimilarityCandidates.hasValue()) - SimilarityCandidates->clear(); - else - SimilarityCandidates = SimilarityGroupList(); - } - - // \returns The groups of similarity ranges found in the most recently passed - // set of modules. - Optional<SimilarityGroupList> &getSimilarity() { - return SimilarityCandidates; - } - -private: - /// The allocator for IRInstructionData. - SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator; - - /// The allocator for IRInstructionDataLists. - SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator; - - /// Map Instructions to unsigned integers and wraps the Instruction in an - /// instance of IRInstructionData. - IRInstructionMapper Mapper; - - /// The SimilarityGroups found with the most recent run of \ref - /// findSimilarity. None if there is no recent run. - Optional<SimilarityGroupList> SimilarityCandidates; -}; - -} // end namespace IRSimilarity - -/// An analysis pass based on legacy pass manager that runs and returns -/// IRSimilarityIdentifier run on the Module. -class IRSimilarityIdentifierWrapperPass : public ModulePass { - std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI; - -public: - static char ID; - IRSimilarityIdentifierWrapperPass(); - - IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } - const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } - - bool doInitialization(Module &M) override; - bool doFinalization(Module &M) override; - bool runOnModule(Module &M) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - } -}; - -/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the -/// Module. -class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { -public: - typedef IRSimilarity::IRSimilarityIdentifier Result; - - Result run(Module &M, ModuleAnalysisManager &); - -private: - friend AnalysisInfoMixin<IRSimilarityAnalysis>; - static AnalysisKey Key; -}; - -/// Printer pass that uses \c IRSimilarityAnalysis. -class IRSimilarityAnalysisPrinterPass - : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { - raw_ostream &OS; - -public: - explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - -} // end namespace llvm - -#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif +#pragma once + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// Interface file for the IRSimilarityIdentifier for identifying similarities in +// IR including the IRInstructionMapper, which maps an Instruction to unsigned +// integers. +// +// Two sequences of instructions are called "similar" if they perform the same +// series of operations for all inputs. +// +// \code +// %1 = add i32 %a, 10 +// %2 = add i32 %a, %1 +// %3 = icmp slt icmp %1, %2 +// \endcode +// +// and +// +// \code +// %1 = add i32 11, %a +// %2 = sub i32 %a, %1 +// %3 = icmp sgt icmp %2, %1 +// \endcode +// +// ultimately have the same result, even if the inputs, and structure are +// slightly different. +// +// For instructions, we do not worry about operands that do not have fixed +// semantic meaning to the program. We consider the opcode that the instruction +// has, the types, parameters, and extra information such as the function name, +// or comparison predicate. These are used to create a hash to map instructions +// to integers to be used in similarity matching in sequences of instructions +// +// Terminology: +// An IRSimilarityCandidate is a region of IRInstructionData (wrapped +// Instructions), usually used to denote a region of similarity has been found. +// +// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally +// similar to one another. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H +#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H + +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" + +namespace llvm { +namespace IRSimilarity { + +struct IRInstructionDataList; + +/// This represents what is and is not supported when finding similarity in +/// Instructions. +/// +/// Legal Instructions are considered when looking at similarity between +/// Instructions. +/// +/// Illegal Instructions cannot be considered when looking for similarity +/// between Instructions. They act as boundaries between similarity regions. +/// +/// Invisible Instructions are skipped over during analysis. +// TODO: Shared with MachineOutliner +enum InstrType { Legal, Illegal, Invisible }; + +/// This provides the utilities for hashing an Instruction to an unsigned +/// integer. Two IRInstructionDatas produce the same hash value when their +/// underlying Instructions perform the same operation (even if they don't have +/// the same input operands.) +/// As a more concrete example, consider the following: +/// +/// \code +/// %add1 = add i32 %a, %b +/// %add2 = add i32 %c, %d +/// %add3 = add i64 %e, %f +/// \endcode +/// +// Then the IRInstructionData wrappers for these Instructions may be hashed like +/// so: +/// +/// \code +/// ; These two adds have the same types and operand types, so they hash to the +/// ; same number. +/// %add1 = add i32 %a, %b ; Hash: 1 +/// %add2 = add i32 %c, %d ; Hash: 1 +/// ; This add produces an i64. This differentiates it from %add1 and %add2. So, +/// ; it hashes to a different number. +/// %add3 = add i64 %e, %f; Hash: 2 +/// \endcode +/// +/// +/// This hashing scheme will be used to represent the program as a very long +/// string. This string can then be placed in a data structure which can be used +/// for similarity queries. +/// +/// TODO: Handle types of Instructions which can be equal even with different +/// operands. (E.g. comparisons with swapped predicates.) +/// TODO: Handle CallInsts, which are only checked for function type +/// by \ref isSameOperationAs. +/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the +/// exact same, and some do not. +struct IRInstructionData : ilist_node<IRInstructionData> { + + /// The source Instruction that is being wrapped. + Instruction *Inst = nullptr; + /// The values of the operands in the Instruction. + SmallVector<Value *, 4> OperVals; + /// The legality of the wrapped instruction. This is informed by InstrType, + /// and is used when checking when two instructions are considered similar. + /// If either instruction is not legal, the instructions are automatically not + /// considered similar. + bool Legal; + + /// This is only relevant if we are wrapping a CmpInst where we needed to + /// change the predicate of a compare instruction from a greater than form + /// to a less than form. It is None otherwise. + Optional<CmpInst::Predicate> RevisedPredicate; + + /// Gather the information that is difficult to gather for an Instruction, or + /// is changed. i.e. the operands of an Instruction and the Types of those + /// operands. This extra information allows for similarity matching to make + /// assertions that allow for more flexibility when checking for whether an + /// Instruction performs the same operation. + IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); + + /// Get the predicate that the compare instruction is using for hashing the + /// instruction. the IRInstructionData must be wrapping a CmpInst. + CmpInst::Predicate getPredicate() const; + + /// A function that swaps the predicates to their less than form if they are + /// in a greater than form. Otherwise, the predicate is unchanged. + /// + /// \param CI - The comparison operation to find a consistent preidcate for. + /// \return the consistent comparison predicate. + static CmpInst::Predicate predicateForConsistency(CmpInst *CI); + + /// Hashes \p Value based on its opcode, types, and operand types. + /// Two IRInstructionData instances produce the same hash when they perform + /// the same operation. + /// + /// As a simple example, consider the following instructions. + /// + /// \code + /// %add1 = add i32 %x1, %y1 + /// %add2 = add i32 %x2, %y2 + /// + /// %sub = sub i32 %x1, %y1 + /// + /// %add_i64 = add i64 %x2, %y2 + /// \endcode + /// + /// Because the first two adds operate the same types, and are performing the + /// same action, they will be hashed to the same value. + /// + /// However, the subtraction instruction is not the same as an addition, and + /// will be hashed to a different value. + /// + /// Finally, the last add has a different type compared to the first two add + /// instructions, so it will also be hashed to a different value that any of + /// the previous instructions. + /// + /// \param [in] ID - The IRInstructionData instance to be hashed. + /// \returns A hash_value of the IRInstructionData. + friend hash_code hash_value(const IRInstructionData &ID) { + SmallVector<Type *, 4> OperTypes; + for (Value *V : ID.OperVals) + OperTypes.push_back(V->getType()); + + if (isa<CmpInst>(ID.Inst)) + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_value(ID.getPredicate()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst)) + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_value(CI->getCalledFunction()->getName().str()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + } + + IRInstructionDataList *IDL = nullptr; +}; + +struct IRInstructionDataList : simple_ilist<IRInstructionData> {}; + +/// Compare one IRInstructionData class to another IRInstructionData class for +/// whether they are performing a the same operation, and can mapped to the +/// same value. For regular instructions if the hash value is the same, then +/// they will also be close. +/// +/// \param A - The first IRInstructionData class to compare +/// \param B - The second IRInstructionData class to compare +/// \returns true if \p A and \p B are similar enough to be mapped to the same +/// value. +bool isClose(const IRInstructionData &A, const IRInstructionData &B); + +struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { + static inline IRInstructionData *getEmptyKey() { return nullptr; } + static inline IRInstructionData *getTombstoneKey() { + return reinterpret_cast<IRInstructionData *>(-1); + } + + static unsigned getHashValue(const IRInstructionData *E) { + using llvm::hash_value; + assert(E && "IRInstructionData is a nullptr?"); + return hash_value(*E); + } + + static bool isEqual(const IRInstructionData *LHS, + const IRInstructionData *RHS) { + if (RHS == getEmptyKey() || RHS == getTombstoneKey() || + LHS == getEmptyKey() || LHS == getTombstoneKey()) + return LHS == RHS; + + assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); + return isClose(*LHS, *RHS); + } +}; + +/// Helper struct for converting the Instructions in a Module into a vector of +/// unsigned integers. This vector of unsigned integers can be thought of as a +/// "numeric string". This numeric string can then be queried by, for example, +/// data structures that find repeated substrings. +/// +/// This hashing is done per BasicBlock in the module. To hash Instructions +/// based off of their operations, each Instruction is wrapped in an +/// IRInstructionData struct. The unsigned integer for an IRInstructionData +/// depends on: +/// - The hash provided by the IRInstructionData. +/// - Which member of InstrType the IRInstructionData is classified as. +// See InstrType for more details on the possible classifications, and how they +// manifest in the numeric string. +/// +/// The numeric string for an individual BasicBlock is terminated by an unique +/// unsigned integer. This prevents data structures which rely on repetition +/// from matching across BasicBlocks. (For example, the SuffixTree.) +/// As a concrete example, if we have the following two BasicBlocks: +/// \code +/// bb0: +/// %add1 = add i32 %a, %b +/// %add2 = add i32 %c, %d +/// %add3 = add i64 %e, %f +/// bb1: +/// %sub = sub i32 %c, %d +/// \endcode +/// We may hash the Instructions like this (via IRInstructionData): +/// \code +/// bb0: +/// %add1 = add i32 %a, %b ; Hash: 1 +/// %add2 = add i32 %c, %d; Hash: 1 +/// %add3 = add i64 %e, %f; Hash: 2 +/// bb1: +/// %sub = sub i32 %c, %d; Hash: 3 +/// %add4 = add i32 %c, %d ; Hash: 1 +/// \endcode +/// And produce a "numeric string representation" like so: +/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 +/// +/// TODO: This is very similar to the MachineOutliner, and should be +/// consolidated into the same interface. +struct IRInstructionMapper { + /// The starting illegal instruction number to map to. + /// + /// Set to -3 for compatibility with DenseMapInfo<unsigned>. + unsigned IllegalInstrNumber = static_cast<unsigned>(-3); + + /// The next available integer to assign to a legal Instruction to. + unsigned LegalInstrNumber = 0; + + /// Correspondence from IRInstructionData to unsigned integers. + DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> + InstructionIntegerMap; + + /// Set if we added an illegal number in the previous step. + /// Since each illegal number is unique, we only need one of them between + /// each range of legal numbers. This lets us make sure we don't add more + /// than one illegal number per range. + bool AddedIllegalLastTime = false; + + /// Marks whether we found a illegal instruction in the previous step. + bool CanCombineWithPrevInstr = false; + + /// Marks whether we have found a set of instructions that is long enough + /// to be considered for similarity. + bool HaveLegalRange = false; + + /// This allocator pointer is in charge of holding on to the IRInstructionData + /// so it is not deallocated until whatever external tool is using it is done + /// with the information. + SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; + + /// This allocator pointer is in charge of creating the IRInstructionDataList + /// so it is not deallocated until whatever external tool is using it is done + /// with the information. + SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr; + + /// Get an allocated IRInstructionData struct using the InstDataAllocator. + /// + /// \param I - The Instruction to wrap with IRInstructionData. + /// \param Legality - A boolean value that is true if the instruction is to + /// be considered for similarity, and false if not. + /// \param IDL - The InstructionDataList that the IRInstructionData is + /// inserted into. + /// \returns An allocated IRInstructionData struct. + IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, + IRInstructionDataList &IDL); + + /// Get an allocated IRInstructionDataList object using the IDLAllocator. + /// + /// \returns An allocated IRInstructionDataList object. + IRInstructionDataList *allocateIRInstructionDataList(); + + IRInstructionDataList *IDL = nullptr; + + /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers + /// determined by \p InstrType. Two Instructions are mapped to the same value + /// if they are close as defined by the InstructionData class above. + /// + /// \param [in] BB - The BasicBlock to be mapped to integers. + /// \param [in,out] InstrList - Vector of IRInstructionData to append to. + /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. + void convertToUnsignedVec(BasicBlock &BB, + std::vector<IRInstructionData *> &InstrList, + std::vector<unsigned> &IntegerMapping); + + /// Maps an Instruction to a legal integer. + /// + /// \param [in] It - The Instruction to be mapped to an integer. + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to + /// append to. + /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. + /// \returns The integer \p It was mapped to. + unsigned mapToLegalUnsigned(BasicBlock::iterator &It, + std::vector<unsigned> &IntegerMappingForBB, + std::vector<IRInstructionData *> &InstrListForBB); + + /// Maps an Instruction to an illegal integer. + /// + /// \param [in] It - The \p Instruction to be mapped to an integer. + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to + /// append to. + /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. + /// \param End - true if creating a dummy IRInstructionData at the end of a + /// basic block. + /// \returns The integer \p It was mapped to. + unsigned mapToIllegalUnsigned( + BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, + std::vector<IRInstructionData *> &InstrListForBB, bool End = false); + + IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA, + SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA) + : InstDataAllocator(IDA), IDLAllocator(IDLA) { + // Make sure that the implementation of DenseMapInfo<unsigned> hasn't + // changed. + assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && + "DenseMapInfo<unsigned>'s empty key isn't -1!"); + assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2) && + "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + + IDL = new (IDLAllocator->Allocate()) + IRInstructionDataList(); + } + + /// Custom InstVisitor to classify different instructions for whether it can + /// be analyzed for similarity. + struct InstructionClassification + : public InstVisitor<InstructionClassification, InstrType> { + InstructionClassification() {} + + // TODO: Determine a scheme to resolve when the label is similar enough. + InstrType visitBranchInst(BranchInst &BI) { return Illegal; } + // TODO: Determine a scheme to resolve when the labels are similar enough. + InstrType visitPHINode(PHINode &PN) { return Illegal; } + // TODO: Handle allocas. + InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } + // We exclude variable argument instructions since variable arguments + // requires extra checking of the argument list. + InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } + // We exclude all exception handling cases since they are so context + // dependent. + InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } + InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } + // DebugInfo should be included in the regions, but should not be + // analyzed for similarity as it has no bearing on the outcome of the + // program. + InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } + // TODO: Handle specific intrinsics. + InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } + // We only allow call instructions where the function has a name and + // is not an indirect call. + InstrType visitCallInst(CallInst &CI) { + Function *F = CI.getCalledFunction(); + if (!F || CI.isIndirectCall() || !F->hasName()) + return Illegal; + return Legal; + } + // TODO: We do not current handle similarity that changes the control flow. + InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } + // TODO: We do not current handle similarity that changes the control flow. + InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } + // TODO: Handle interblock similarity. + InstrType visitTerminator(Instruction &I) { return Illegal; } + InstrType visitInstruction(Instruction &I) { return Legal; } + }; + + /// Maps an Instruction to a member of InstrType. + InstructionClassification InstClassifier; +}; + +/// This is a class that wraps a range of IRInstructionData from one point to +/// another in the vector of IRInstructionData, which is a region of the +/// program. It is also responsible for defining the structure within this +/// region of instructions. +/// +/// The structure of a region is defined through a value numbering system +/// assigned to each unique value in a region at the creation of the +/// IRSimilarityCandidate. +/// +/// For example, for each Instruction we add a mapping for each new +/// value seen in that Instruction. +/// IR: Mapping Added: +/// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 +/// %add2 = add i32 %a, %1 %add2 -> 4 +/// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 +/// +/// We can compare IRSimilarityCandidates against one another. +/// The \ref isSimilar function compares each IRInstructionData against one +/// another and if we have the same sequences of IRInstructionData that would +/// create the same hash, we have similar IRSimilarityCandidates. +/// +/// We can also compare the structure of IRSimilarityCandidates. If we can +/// create a mapping of registers in the region contained by one +/// IRSimilarityCandidate to the region contained by different +/// IRSimilarityCandidate, they can be considered structurally similar. +/// +/// IRSimilarityCandidate1: IRSimilarityCandidate2: +/// %add1 = add i32 %a, %b %add1 = add i32 %d, %e +/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f +/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 +/// +/// Can have the following mapping from candidate to candidate of: +/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 +/// and can be considered similar. +/// +/// IRSimilarityCandidate1: IRSimilarityCandidate2: +/// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 +/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f +/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 +/// +/// We cannot create the same mapping since the use of c4 is not used in the +/// same way as %b or c2. +class IRSimilarityCandidate { +private: + /// The start index of this IRSimilarityCandidate in the instruction list. + unsigned StartIdx = 0; + + /// The number of instructions in this IRSimilarityCandidate. + unsigned Len = 0; + + /// The first instruction in this IRSimilarityCandidate. + IRInstructionData *FirstInst = nullptr; + + /// The last instruction in this IRSimilarityCandidate. + IRInstructionData *LastInst = nullptr; + + /// Global Value Numbering structures + /// @{ + /// Stores the mapping of the value to the number assigned to it in the + /// IRSimilarityCandidate. + DenseMap<Value *, unsigned> ValueToNumber; + /// Stores the mapping of the number to the value assigned this number. + DenseMap<unsigned, Value *> NumberToValue; + /// @} + +public: + /// \param StartIdx - The starting location of the region. + /// \param Len - The length of the region. + /// \param FirstInstIt - The starting IRInstructionData of the region. + /// \param LastInstIt - The ending IRInstructionData of the region. + IRSimilarityCandidate(unsigned StartIdx, unsigned Len, + IRInstructionData *FirstInstIt, + IRInstructionData *LastInstIt); + + /// \param A - The first IRInstructionCandidate to compare. + /// \param B - The second IRInstructionCandidate to compare. + /// \returns True when every IRInstructionData in \p A is similar to every + /// IRInstructionData in \p B. + static bool isSimilar(const IRSimilarityCandidate &A, + const IRSimilarityCandidate &B); + + /// \param A - The first IRInstructionCandidate to compare. + /// \param B - The second IRInstructionCandidate to compare. + /// \returns True when every IRInstructionData in \p A is structurally similar + /// to \p B. + static bool compareStructure(const IRSimilarityCandidate &A, + const IRSimilarityCandidate &B); + + struct OperandMapping { + /// The IRSimilarityCandidate that holds the instruction the OperVals were + /// pulled from. + const IRSimilarityCandidate &IRSC; + + /// The operand values to be analyzed. + ArrayRef<Value *> &OperVals; + + /// The current mapping of global value numbers from one IRSimilarityCandidate + /// to another IRSimilarityCandidate. + DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping; + }; + + /// Compare the operands in \p A and \p B and check that the current mapping + /// of global value numbers from \p A to \p B and \p B to \A is consistent. + /// + /// \param A - The first IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \param B - The second IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \returns true if the IRSimilarityCandidates operands are compatible. + static bool compareNonCommutativeOperandMapping(OperandMapping A, + OperandMapping B); + + /// Compare the operands in \p A and \p B and check that the current mapping + /// of global value numbers from \p A to \p B and \p B to \A is consistent + /// given that the operands are commutative. + /// + /// \param A - The first IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \param B - The second IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \returns true if the IRSimilarityCandidates operands are compatible. + static bool compareCommutativeOperandMapping(OperandMapping A, + OperandMapping B); + + /// Compare the start and end indices of the two IRSimilarityCandidates for + /// whether they overlap. If the start instruction of one + /// IRSimilarityCandidate is less than the end instruction of the other, and + /// the start instruction of one is greater than the start instruction of the + /// other, they overlap. + /// + /// \returns true if the IRSimilarityCandidates do not have overlapping + /// instructions. + static bool overlap(const IRSimilarityCandidate &A, + const IRSimilarityCandidate &B); + + /// \returns the number of instructions in this Candidate. + unsigned getLength() const { return Len; } + + /// \returns the start index of this IRSimilarityCandidate. + unsigned getStartIdx() const { return StartIdx; } + + /// \returns the end index of this IRSimilarityCandidate. + unsigned getEndIdx() const { return StartIdx + Len - 1; } + + /// \returns The first IRInstructionData. + IRInstructionData *front() const { return FirstInst; } + /// \returns The last IRInstructionData. + IRInstructionData *back() const { return LastInst; } + + /// \returns The first Instruction. + Instruction *frontInstruction() { return FirstInst->Inst; } + /// \returns The last Instruction + Instruction *backInstruction() { return LastInst->Inst; } + + /// \returns The BasicBlock the IRSimilarityCandidate starts in. + BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } + /// \returns The BasicBlock the IRSimilarityCandidate ends in. + BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } + + /// \returns The Function that the IRSimilarityCandidate is located in. + Function *getFunction() { return getStartBB()->getParent(); } + + /// Finds the positive number associated with \p V if it has been mapped. + /// \param [in] V - the Value to find. + /// \returns The positive number corresponding to the value. + /// \returns None if not present. + Optional<unsigned> getGVN(Value *V) { + assert(V != nullptr && "Value is a nullptr?"); + DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V); + if (VNIt == ValueToNumber.end()) + return None; + return VNIt->second; + } + + /// Finds the Value associate with \p Num if it exists. + /// \param [in] Num - the number to find. + /// \returns The Value associated with the number. + /// \returns None if not present. + Optional<Value *> fromGVN(unsigned Num) { + DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num); + if (VNIt == NumberToValue.end()) + return None; + assert(VNIt->second != nullptr && "Found value is a nullptr!"); + return VNIt->second; + } + + /// \param RHS -The IRSimilarityCandidate to compare against + /// \returns true if the IRSimilarityCandidate is occurs after the + /// IRSimilarityCandidate in the program. + bool operator<(const IRSimilarityCandidate &RHS) const { + return getStartIdx() > RHS.getStartIdx(); + } + + using iterator = IRInstructionDataList::iterator; + iterator begin() const { return iterator(front()); } + iterator end() const { return std::next(iterator(back())); } +}; + +typedef std::vector<IRSimilarityCandidate> SimilarityGroup; +typedef std::vector<SimilarityGroup> SimilarityGroupList; + +/// This class puts all the pieces of the IRInstructionData, +/// IRInstructionMapper, IRSimilarityCandidate together. +/// +/// It first feeds the Module or vector of Modules into the IRInstructionMapper, +/// and puts all the mapped instructions into a single long list of +/// IRInstructionData. +/// +/// The list of unsigned integers is given to the Suffix Tree or similar data +/// structure to find repeated subsequences. We construct an +/// IRSimilarityCandidate for each instance of the subsequence. We compare them +/// against one another since These repeated subsequences can have different +/// structure. For each different kind of structure found, we create a +/// similarity group. +/// +/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are +/// structurally similar to one another, while C is different we would have two +/// SimilarityGroups: +/// +/// SimilarityGroup 1: SimilarityGroup 2 +/// A, B, D C +/// +/// A list of the different similarity groups is then returned after +/// analyzing the module. +class IRSimilarityIdentifier { +public: + IRSimilarityIdentifier() + : Mapper(&InstDataAllocator, &InstDataListAllocator) {} + + /// \param M the module to find similarity in. + explicit IRSimilarityIdentifier(Module &M) + : Mapper(&InstDataAllocator, &InstDataListAllocator) { + findSimilarity(M); + } + +private: + /// Map the instructions in the module to unsigned integers, using mapping + /// already present in the Mapper if possible. + /// + /// \param [in] M Module - To map to integers. + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList, + std::vector<unsigned> &IntegerMapping); + + /// Map the instructions in the modules vector to unsigned integers, using + /// mapping already present in the mapper if possible. + /// + /// \param [in] Modules - The list of modules to use to populate the mapper + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules, + std::vector<IRInstructionData *> &InstrList, + std::vector<unsigned> &IntegerMapping); + + /// Find the similarity candidates in \p InstrList and corresponding + /// \p UnsignedVec + /// + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + /// candidates found in the program. + void findCandidates(std::vector<IRInstructionData *> &InstrList, + std::vector<unsigned> &IntegerMapping); + +public: + // Find the IRSimilarityCandidates in the \p Modules and group by structural + // similarity in a SimilarityGroup, each group is returned in a + // SimilarityGroupList. + // + // \param [in] Modules - the modules to analyze. + // \returns The groups of similarity ranges found in the modules. + SimilarityGroupList & + findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules); + + // Find the IRSimilarityCandidates in the given Module grouped by structural + // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. + // + // \param [in] M - the module to analyze. + // \returns The groups of similarity ranges found in the module. + SimilarityGroupList &findSimilarity(Module &M); + + // Clears \ref SimilarityCandidates if it is already filled by a previous run. + void resetSimilarityCandidates() { + // If we've already analyzed a Module or set of Modules, so we must clear + // the SimilarityCandidates to make sure we do not have only old values + // hanging around. + if (SimilarityCandidates.hasValue()) + SimilarityCandidates->clear(); + else + SimilarityCandidates = SimilarityGroupList(); + } + + // \returns The groups of similarity ranges found in the most recently passed + // set of modules. + Optional<SimilarityGroupList> &getSimilarity() { + return SimilarityCandidates; + } + +private: + /// The allocator for IRInstructionData. + SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator; + + /// The allocator for IRInstructionDataLists. + SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator; + + /// Map Instructions to unsigned integers and wraps the Instruction in an + /// instance of IRInstructionData. + IRInstructionMapper Mapper; + + /// The SimilarityGroups found with the most recent run of \ref + /// findSimilarity. None if there is no recent run. + Optional<SimilarityGroupList> SimilarityCandidates; +}; + +} // end namespace IRSimilarity + +/// An analysis pass based on legacy pass manager that runs and returns +/// IRSimilarityIdentifier run on the Module. +class IRSimilarityIdentifierWrapperPass : public ModulePass { + std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI; + +public: + static char ID; + IRSimilarityIdentifierWrapperPass(); + + IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } + const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } + + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; + +/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the +/// Module. +class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { +public: + typedef IRSimilarity::IRSimilarityIdentifier Result; + + Result run(Module &M, ModuleAnalysisManager &); + +private: + friend AnalysisInfoMixin<IRSimilarityAnalysis>; + static AnalysisKey Key; +}; + +/// Printer pass that uses \c IRSimilarityAnalysis. +class IRSimilarityAnalysisPrinterPass + : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { + raw_ostream &OS; + +public: + explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif |