aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h
diff options
context:
space:
mode:
authorshadchin <shadchin@yandex-team.ru>2022-02-10 16:44:39 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:44:39 +0300
commite9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch)
tree64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h
parent2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff)
downloadydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h')
-rw-r--r--contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h1600
1 files changed, 800 insertions, 800 deletions
diff --git a/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h b/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h
index 4544bfc8b9..6c765c5c00 100644
--- a/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h
+++ b/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h
@@ -1,800 +1,800 @@
-#pragma once
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#endif
-
-//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// Interface file for the IRSimilarityIdentifier for identifying similarities in
-// IR including the IRInstructionMapper, which maps an Instruction to unsigned
-// integers.
-//
-// Two sequences of instructions are called "similar" if they perform the same
-// series of operations for all inputs.
-//
-// \code
-// %1 = add i32 %a, 10
-// %2 = add i32 %a, %1
-// %3 = icmp slt icmp %1, %2
-// \endcode
-//
-// and
-//
-// \code
-// %1 = add i32 11, %a
-// %2 = sub i32 %a, %1
-// %3 = icmp sgt icmp %2, %1
-// \endcode
-//
-// ultimately have the same result, even if the inputs, and structure are
-// slightly different.
-//
-// For instructions, we do not worry about operands that do not have fixed
-// semantic meaning to the program. We consider the opcode that the instruction
-// has, the types, parameters, and extra information such as the function name,
-// or comparison predicate. These are used to create a hash to map instructions
-// to integers to be used in similarity matching in sequences of instructions
-//
-// Terminology:
-// An IRSimilarityCandidate is a region of IRInstructionData (wrapped
-// Instructions), usually used to denote a region of similarity has been found.
-//
-// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
-// similar to one another.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
-#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
-
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-
-namespace llvm {
-namespace IRSimilarity {
-
-struct IRInstructionDataList;
-
-/// This represents what is and is not supported when finding similarity in
-/// Instructions.
-///
-/// Legal Instructions are considered when looking at similarity between
-/// Instructions.
-///
-/// Illegal Instructions cannot be considered when looking for similarity
-/// between Instructions. They act as boundaries between similarity regions.
-///
-/// Invisible Instructions are skipped over during analysis.
-// TODO: Shared with MachineOutliner
-enum InstrType { Legal, Illegal, Invisible };
-
-/// This provides the utilities for hashing an Instruction to an unsigned
-/// integer. Two IRInstructionDatas produce the same hash value when their
-/// underlying Instructions perform the same operation (even if they don't have
-/// the same input operands.)
-/// As a more concrete example, consider the following:
-///
-/// \code
-/// %add1 = add i32 %a, %b
-/// %add2 = add i32 %c, %d
-/// %add3 = add i64 %e, %f
-/// \endcode
-///
-// Then the IRInstructionData wrappers for these Instructions may be hashed like
-/// so:
-///
-/// \code
-/// ; These two adds have the same types and operand types, so they hash to the
-/// ; same number.
-/// %add1 = add i32 %a, %b ; Hash: 1
-/// %add2 = add i32 %c, %d ; Hash: 1
-/// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
-/// ; it hashes to a different number.
-/// %add3 = add i64 %e, %f; Hash: 2
-/// \endcode
-///
-///
-/// This hashing scheme will be used to represent the program as a very long
-/// string. This string can then be placed in a data structure which can be used
-/// for similarity queries.
-///
-/// TODO: Handle types of Instructions which can be equal even with different
-/// operands. (E.g. comparisons with swapped predicates.)
-/// TODO: Handle CallInsts, which are only checked for function type
-/// by \ref isSameOperationAs.
-/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
-/// exact same, and some do not.
-struct IRInstructionData : ilist_node<IRInstructionData> {
-
- /// The source Instruction that is being wrapped.
- Instruction *Inst = nullptr;
- /// The values of the operands in the Instruction.
- SmallVector<Value *, 4> OperVals;
- /// The legality of the wrapped instruction. This is informed by InstrType,
- /// and is used when checking when two instructions are considered similar.
- /// If either instruction is not legal, the instructions are automatically not
- /// considered similar.
- bool Legal;
-
- /// This is only relevant if we are wrapping a CmpInst where we needed to
- /// change the predicate of a compare instruction from a greater than form
- /// to a less than form. It is None otherwise.
- Optional<CmpInst::Predicate> RevisedPredicate;
-
- /// Gather the information that is difficult to gather for an Instruction, or
- /// is changed. i.e. the operands of an Instruction and the Types of those
- /// operands. This extra information allows for similarity matching to make
- /// assertions that allow for more flexibility when checking for whether an
- /// Instruction performs the same operation.
- IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
-
- /// Get the predicate that the compare instruction is using for hashing the
- /// instruction. the IRInstructionData must be wrapping a CmpInst.
- CmpInst::Predicate getPredicate() const;
-
- /// A function that swaps the predicates to their less than form if they are
- /// in a greater than form. Otherwise, the predicate is unchanged.
- ///
- /// \param CI - The comparison operation to find a consistent preidcate for.
- /// \return the consistent comparison predicate.
- static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
-
- /// Hashes \p Value based on its opcode, types, and operand types.
- /// Two IRInstructionData instances produce the same hash when they perform
- /// the same operation.
- ///
- /// As a simple example, consider the following instructions.
- ///
- /// \code
- /// %add1 = add i32 %x1, %y1
- /// %add2 = add i32 %x2, %y2
- ///
- /// %sub = sub i32 %x1, %y1
- ///
- /// %add_i64 = add i64 %x2, %y2
- /// \endcode
- ///
- /// Because the first two adds operate the same types, and are performing the
- /// same action, they will be hashed to the same value.
- ///
- /// However, the subtraction instruction is not the same as an addition, and
- /// will be hashed to a different value.
- ///
- /// Finally, the last add has a different type compared to the first two add
- /// instructions, so it will also be hashed to a different value that any of
- /// the previous instructions.
- ///
- /// \param [in] ID - The IRInstructionData instance to be hashed.
- /// \returns A hash_value of the IRInstructionData.
- friend hash_code hash_value(const IRInstructionData &ID) {
- SmallVector<Type *, 4> OperTypes;
- for (Value *V : ID.OperVals)
- OperTypes.push_back(V->getType());
-
- if (isa<CmpInst>(ID.Inst))
- return llvm::hash_combine(
- llvm::hash_value(ID.Inst->getOpcode()),
- llvm::hash_value(ID.Inst->getType()),
- llvm::hash_value(ID.getPredicate()),
- llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
- else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst))
- return llvm::hash_combine(
- llvm::hash_value(ID.Inst->getOpcode()),
- llvm::hash_value(ID.Inst->getType()),
- llvm::hash_value(CI->getCalledFunction()->getName().str()),
- llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
- return llvm::hash_combine(
- llvm::hash_value(ID.Inst->getOpcode()),
- llvm::hash_value(ID.Inst->getType()),
- llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
- }
-
- IRInstructionDataList *IDL = nullptr;
-};
-
-struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
-
-/// Compare one IRInstructionData class to another IRInstructionData class for
-/// whether they are performing a the same operation, and can mapped to the
-/// same value. For regular instructions if the hash value is the same, then
-/// they will also be close.
-///
-/// \param A - The first IRInstructionData class to compare
-/// \param B - The second IRInstructionData class to compare
-/// \returns true if \p A and \p B are similar enough to be mapped to the same
-/// value.
-bool isClose(const IRInstructionData &A, const IRInstructionData &B);
-
-struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
- static inline IRInstructionData *getEmptyKey() { return nullptr; }
- static inline IRInstructionData *getTombstoneKey() {
- return reinterpret_cast<IRInstructionData *>(-1);
- }
-
- static unsigned getHashValue(const IRInstructionData *E) {
- using llvm::hash_value;
- assert(E && "IRInstructionData is a nullptr?");
- return hash_value(*E);
- }
-
- static bool isEqual(const IRInstructionData *LHS,
- const IRInstructionData *RHS) {
- if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
- LHS == getEmptyKey() || LHS == getTombstoneKey())
- return LHS == RHS;
-
- assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
- return isClose(*LHS, *RHS);
- }
-};
-
-/// Helper struct for converting the Instructions in a Module into a vector of
-/// unsigned integers. This vector of unsigned integers can be thought of as a
-/// "numeric string". This numeric string can then be queried by, for example,
-/// data structures that find repeated substrings.
-///
-/// This hashing is done per BasicBlock in the module. To hash Instructions
-/// based off of their operations, each Instruction is wrapped in an
-/// IRInstructionData struct. The unsigned integer for an IRInstructionData
-/// depends on:
-/// - The hash provided by the IRInstructionData.
-/// - Which member of InstrType the IRInstructionData is classified as.
-// See InstrType for more details on the possible classifications, and how they
-// manifest in the numeric string.
-///
-/// The numeric string for an individual BasicBlock is terminated by an unique
-/// unsigned integer. This prevents data structures which rely on repetition
-/// from matching across BasicBlocks. (For example, the SuffixTree.)
-/// As a concrete example, if we have the following two BasicBlocks:
-/// \code
-/// bb0:
-/// %add1 = add i32 %a, %b
-/// %add2 = add i32 %c, %d
-/// %add3 = add i64 %e, %f
-/// bb1:
-/// %sub = sub i32 %c, %d
-/// \endcode
-/// We may hash the Instructions like this (via IRInstructionData):
-/// \code
-/// bb0:
-/// %add1 = add i32 %a, %b ; Hash: 1
-/// %add2 = add i32 %c, %d; Hash: 1
-/// %add3 = add i64 %e, %f; Hash: 2
-/// bb1:
-/// %sub = sub i32 %c, %d; Hash: 3
-/// %add4 = add i32 %c, %d ; Hash: 1
-/// \endcode
-/// And produce a "numeric string representation" like so:
-/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
-///
-/// TODO: This is very similar to the MachineOutliner, and should be
-/// consolidated into the same interface.
-struct IRInstructionMapper {
- /// The starting illegal instruction number to map to.
- ///
- /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
- unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
-
- /// The next available integer to assign to a legal Instruction to.
- unsigned LegalInstrNumber = 0;
-
- /// Correspondence from IRInstructionData to unsigned integers.
- DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
- InstructionIntegerMap;
-
- /// Set if we added an illegal number in the previous step.
- /// Since each illegal number is unique, we only need one of them between
- /// each range of legal numbers. This lets us make sure we don't add more
- /// than one illegal number per range.
- bool AddedIllegalLastTime = false;
-
- /// Marks whether we found a illegal instruction in the previous step.
- bool CanCombineWithPrevInstr = false;
-
- /// Marks whether we have found a set of instructions that is long enough
- /// to be considered for similarity.
- bool HaveLegalRange = false;
-
- /// This allocator pointer is in charge of holding on to the IRInstructionData
- /// so it is not deallocated until whatever external tool is using it is done
- /// with the information.
- SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
-
- /// This allocator pointer is in charge of creating the IRInstructionDataList
- /// so it is not deallocated until whatever external tool is using it is done
- /// with the information.
- SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
-
- /// Get an allocated IRInstructionData struct using the InstDataAllocator.
- ///
- /// \param I - The Instruction to wrap with IRInstructionData.
- /// \param Legality - A boolean value that is true if the instruction is to
- /// be considered for similarity, and false if not.
- /// \param IDL - The InstructionDataList that the IRInstructionData is
- /// inserted into.
- /// \returns An allocated IRInstructionData struct.
- IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
- IRInstructionDataList &IDL);
-
- /// Get an allocated IRInstructionDataList object using the IDLAllocator.
- ///
- /// \returns An allocated IRInstructionDataList object.
- IRInstructionDataList *allocateIRInstructionDataList();
-
- IRInstructionDataList *IDL = nullptr;
-
- /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
- /// determined by \p InstrType. Two Instructions are mapped to the same value
- /// if they are close as defined by the InstructionData class above.
- ///
- /// \param [in] BB - The BasicBlock to be mapped to integers.
- /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
- /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
- void convertToUnsignedVec(BasicBlock &BB,
- std::vector<IRInstructionData *> &InstrList,
- std::vector<unsigned> &IntegerMapping);
-
- /// Maps an Instruction to a legal integer.
- ///
- /// \param [in] It - The Instruction to be mapped to an integer.
- /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
- /// append to.
- /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
- /// \returns The integer \p It was mapped to.
- unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
- std::vector<unsigned> &IntegerMappingForBB,
- std::vector<IRInstructionData *> &InstrListForBB);
-
- /// Maps an Instruction to an illegal integer.
- ///
- /// \param [in] It - The \p Instruction to be mapped to an integer.
- /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
- /// append to.
- /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
- /// \param End - true if creating a dummy IRInstructionData at the end of a
- /// basic block.
- /// \returns The integer \p It was mapped to.
- unsigned mapToIllegalUnsigned(
- BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
- std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
-
- IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
- SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
- : InstDataAllocator(IDA), IDLAllocator(IDLA) {
- // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
- // changed.
- assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
- "DenseMapInfo<unsigned>'s empty key isn't -1!");
- assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
- static_cast<unsigned>(-2) &&
- "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
-
- IDL = new (IDLAllocator->Allocate())
- IRInstructionDataList();
- }
-
- /// Custom InstVisitor to classify different instructions for whether it can
- /// be analyzed for similarity.
- struct InstructionClassification
- : public InstVisitor<InstructionClassification, InstrType> {
- InstructionClassification() {}
-
- // TODO: Determine a scheme to resolve when the label is similar enough.
- InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
- // TODO: Determine a scheme to resolve when the labels are similar enough.
- InstrType visitPHINode(PHINode &PN) { return Illegal; }
- // TODO: Handle allocas.
- InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
- // We exclude variable argument instructions since variable arguments
- // requires extra checking of the argument list.
- InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
- // We exclude all exception handling cases since they are so context
- // dependent.
- InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
- InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
- // DebugInfo should be included in the regions, but should not be
- // analyzed for similarity as it has no bearing on the outcome of the
- // program.
- InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
- // TODO: Handle specific intrinsics.
- InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
- // We only allow call instructions where the function has a name and
- // is not an indirect call.
- InstrType visitCallInst(CallInst &CI) {
- Function *F = CI.getCalledFunction();
- if (!F || CI.isIndirectCall() || !F->hasName())
- return Illegal;
- return Legal;
- }
- // TODO: We do not current handle similarity that changes the control flow.
- InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
- // TODO: We do not current handle similarity that changes the control flow.
- InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
- // TODO: Handle interblock similarity.
- InstrType visitTerminator(Instruction &I) { return Illegal; }
- InstrType visitInstruction(Instruction &I) { return Legal; }
- };
-
- /// Maps an Instruction to a member of InstrType.
- InstructionClassification InstClassifier;
-};
-
-/// This is a class that wraps a range of IRInstructionData from one point to
-/// another in the vector of IRInstructionData, which is a region of the
-/// program. It is also responsible for defining the structure within this
-/// region of instructions.
-///
-/// The structure of a region is defined through a value numbering system
-/// assigned to each unique value in a region at the creation of the
-/// IRSimilarityCandidate.
-///
-/// For example, for each Instruction we add a mapping for each new
-/// value seen in that Instruction.
-/// IR: Mapping Added:
-/// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
-/// %add2 = add i32 %a, %1 %add2 -> 4
-/// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
-///
-/// We can compare IRSimilarityCandidates against one another.
-/// The \ref isSimilar function compares each IRInstructionData against one
-/// another and if we have the same sequences of IRInstructionData that would
-/// create the same hash, we have similar IRSimilarityCandidates.
-///
-/// We can also compare the structure of IRSimilarityCandidates. If we can
-/// create a mapping of registers in the region contained by one
-/// IRSimilarityCandidate to the region contained by different
-/// IRSimilarityCandidate, they can be considered structurally similar.
-///
-/// IRSimilarityCandidate1: IRSimilarityCandidate2:
-/// %add1 = add i32 %a, %b %add1 = add i32 %d, %e
-/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
-/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
-///
-/// Can have the following mapping from candidate to candidate of:
-/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
-/// and can be considered similar.
-///
-/// IRSimilarityCandidate1: IRSimilarityCandidate2:
-/// %add1 = add i32 %a, %b %add1 = add i32 %d, c4
-/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
-/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
-///
-/// We cannot create the same mapping since the use of c4 is not used in the
-/// same way as %b or c2.
-class IRSimilarityCandidate {
-private:
- /// The start index of this IRSimilarityCandidate in the instruction list.
- unsigned StartIdx = 0;
-
- /// The number of instructions in this IRSimilarityCandidate.
- unsigned Len = 0;
-
- /// The first instruction in this IRSimilarityCandidate.
- IRInstructionData *FirstInst = nullptr;
-
- /// The last instruction in this IRSimilarityCandidate.
- IRInstructionData *LastInst = nullptr;
-
- /// Global Value Numbering structures
- /// @{
- /// Stores the mapping of the value to the number assigned to it in the
- /// IRSimilarityCandidate.
- DenseMap<Value *, unsigned> ValueToNumber;
- /// Stores the mapping of the number to the value assigned this number.
- DenseMap<unsigned, Value *> NumberToValue;
- /// @}
-
-public:
- /// \param StartIdx - The starting location of the region.
- /// \param Len - The length of the region.
- /// \param FirstInstIt - The starting IRInstructionData of the region.
- /// \param LastInstIt - The ending IRInstructionData of the region.
- IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
- IRInstructionData *FirstInstIt,
- IRInstructionData *LastInstIt);
-
- /// \param A - The first IRInstructionCandidate to compare.
- /// \param B - The second IRInstructionCandidate to compare.
- /// \returns True when every IRInstructionData in \p A is similar to every
- /// IRInstructionData in \p B.
- static bool isSimilar(const IRSimilarityCandidate &A,
- const IRSimilarityCandidate &B);
-
- /// \param A - The first IRInstructionCandidate to compare.
- /// \param B - The second IRInstructionCandidate to compare.
- /// \returns True when every IRInstructionData in \p A is structurally similar
- /// to \p B.
- static bool compareStructure(const IRSimilarityCandidate &A,
- const IRSimilarityCandidate &B);
-
- struct OperandMapping {
- /// The IRSimilarityCandidate that holds the instruction the OperVals were
- /// pulled from.
- const IRSimilarityCandidate &IRSC;
-
- /// The operand values to be analyzed.
- ArrayRef<Value *> &OperVals;
-
- /// The current mapping of global value numbers from one IRSimilarityCandidate
- /// to another IRSimilarityCandidate.
- DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
- };
-
- /// Compare the operands in \p A and \p B and check that the current mapping
- /// of global value numbers from \p A to \p B and \p B to \A is consistent.
- ///
- /// \param A - The first IRInstructionCandidate, operand values, and current
- /// operand mappings to compare.
- /// \param B - The second IRInstructionCandidate, operand values, and current
- /// operand mappings to compare.
- /// \returns true if the IRSimilarityCandidates operands are compatible.
- static bool compareNonCommutativeOperandMapping(OperandMapping A,
- OperandMapping B);
-
- /// Compare the operands in \p A and \p B and check that the current mapping
- /// of global value numbers from \p A to \p B and \p B to \A is consistent
- /// given that the operands are commutative.
- ///
- /// \param A - The first IRInstructionCandidate, operand values, and current
- /// operand mappings to compare.
- /// \param B - The second IRInstructionCandidate, operand values, and current
- /// operand mappings to compare.
- /// \returns true if the IRSimilarityCandidates operands are compatible.
- static bool compareCommutativeOperandMapping(OperandMapping A,
- OperandMapping B);
-
- /// Compare the start and end indices of the two IRSimilarityCandidates for
- /// whether they overlap. If the start instruction of one
- /// IRSimilarityCandidate is less than the end instruction of the other, and
- /// the start instruction of one is greater than the start instruction of the
- /// other, they overlap.
- ///
- /// \returns true if the IRSimilarityCandidates do not have overlapping
- /// instructions.
- static bool overlap(const IRSimilarityCandidate &A,
- const IRSimilarityCandidate &B);
-
- /// \returns the number of instructions in this Candidate.
- unsigned getLength() const { return Len; }
-
- /// \returns the start index of this IRSimilarityCandidate.
- unsigned getStartIdx() const { return StartIdx; }
-
- /// \returns the end index of this IRSimilarityCandidate.
- unsigned getEndIdx() const { return StartIdx + Len - 1; }
-
- /// \returns The first IRInstructionData.
- IRInstructionData *front() const { return FirstInst; }
- /// \returns The last IRInstructionData.
- IRInstructionData *back() const { return LastInst; }
-
- /// \returns The first Instruction.
- Instruction *frontInstruction() { return FirstInst->Inst; }
- /// \returns The last Instruction
- Instruction *backInstruction() { return LastInst->Inst; }
-
- /// \returns The BasicBlock the IRSimilarityCandidate starts in.
- BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
- /// \returns The BasicBlock the IRSimilarityCandidate ends in.
- BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
-
- /// \returns The Function that the IRSimilarityCandidate is located in.
- Function *getFunction() { return getStartBB()->getParent(); }
-
- /// Finds the positive number associated with \p V if it has been mapped.
- /// \param [in] V - the Value to find.
- /// \returns The positive number corresponding to the value.
- /// \returns None if not present.
- Optional<unsigned> getGVN(Value *V) {
- assert(V != nullptr && "Value is a nullptr?");
- DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
- if (VNIt == ValueToNumber.end())
- return None;
- return VNIt->second;
- }
-
- /// Finds the Value associate with \p Num if it exists.
- /// \param [in] Num - the number to find.
- /// \returns The Value associated with the number.
- /// \returns None if not present.
- Optional<Value *> fromGVN(unsigned Num) {
- DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
- if (VNIt == NumberToValue.end())
- return None;
- assert(VNIt->second != nullptr && "Found value is a nullptr!");
- return VNIt->second;
- }
-
- /// \param RHS -The IRSimilarityCandidate to compare against
- /// \returns true if the IRSimilarityCandidate is occurs after the
- /// IRSimilarityCandidate in the program.
- bool operator<(const IRSimilarityCandidate &RHS) const {
- return getStartIdx() > RHS.getStartIdx();
- }
-
- using iterator = IRInstructionDataList::iterator;
- iterator begin() const { return iterator(front()); }
- iterator end() const { return std::next(iterator(back())); }
-};
-
-typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
-typedef std::vector<SimilarityGroup> SimilarityGroupList;
-
-/// This class puts all the pieces of the IRInstructionData,
-/// IRInstructionMapper, IRSimilarityCandidate together.
-///
-/// It first feeds the Module or vector of Modules into the IRInstructionMapper,
-/// and puts all the mapped instructions into a single long list of
-/// IRInstructionData.
-///
-/// The list of unsigned integers is given to the Suffix Tree or similar data
-/// structure to find repeated subsequences. We construct an
-/// IRSimilarityCandidate for each instance of the subsequence. We compare them
-/// against one another since These repeated subsequences can have different
-/// structure. For each different kind of structure found, we create a
-/// similarity group.
-///
-/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
-/// structurally similar to one another, while C is different we would have two
-/// SimilarityGroups:
-///
-/// SimilarityGroup 1: SimilarityGroup 2
-/// A, B, D C
-///
-/// A list of the different similarity groups is then returned after
-/// analyzing the module.
-class IRSimilarityIdentifier {
-public:
- IRSimilarityIdentifier()
- : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
-
- /// \param M the module to find similarity in.
- explicit IRSimilarityIdentifier(Module &M)
- : Mapper(&InstDataAllocator, &InstDataListAllocator) {
- findSimilarity(M);
- }
-
-private:
- /// Map the instructions in the module to unsigned integers, using mapping
- /// already present in the Mapper if possible.
- ///
- /// \param [in] M Module - To map to integers.
- /// \param [in,out] InstrList - The vector to append IRInstructionData to.
- /// \param [in,out] IntegerMapping - The vector to append integers to.
- void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
- std::vector<unsigned> &IntegerMapping);
-
- /// Map the instructions in the modules vector to unsigned integers, using
- /// mapping already present in the mapper if possible.
- ///
- /// \param [in] Modules - The list of modules to use to populate the mapper
- /// \param [in,out] InstrList - The vector to append IRInstructionData to.
- /// \param [in,out] IntegerMapping - The vector to append integers to.
- void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
- std::vector<IRInstructionData *> &InstrList,
- std::vector<unsigned> &IntegerMapping);
-
- /// Find the similarity candidates in \p InstrList and corresponding
- /// \p UnsignedVec
- ///
- /// \param [in,out] InstrList - The vector to append IRInstructionData to.
- /// \param [in,out] IntegerMapping - The vector to append integers to.
- /// candidates found in the program.
- void findCandidates(std::vector<IRInstructionData *> &InstrList,
- std::vector<unsigned> &IntegerMapping);
-
-public:
- // Find the IRSimilarityCandidates in the \p Modules and group by structural
- // similarity in a SimilarityGroup, each group is returned in a
- // SimilarityGroupList.
- //
- // \param [in] Modules - the modules to analyze.
- // \returns The groups of similarity ranges found in the modules.
- SimilarityGroupList &
- findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
-
- // Find the IRSimilarityCandidates in the given Module grouped by structural
- // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
- //
- // \param [in] M - the module to analyze.
- // \returns The groups of similarity ranges found in the module.
- SimilarityGroupList &findSimilarity(Module &M);
-
- // Clears \ref SimilarityCandidates if it is already filled by a previous run.
- void resetSimilarityCandidates() {
- // If we've already analyzed a Module or set of Modules, so we must clear
- // the SimilarityCandidates to make sure we do not have only old values
- // hanging around.
- if (SimilarityCandidates.hasValue())
- SimilarityCandidates->clear();
- else
- SimilarityCandidates = SimilarityGroupList();
- }
-
- // \returns The groups of similarity ranges found in the most recently passed
- // set of modules.
- Optional<SimilarityGroupList> &getSimilarity() {
- return SimilarityCandidates;
- }
-
-private:
- /// The allocator for IRInstructionData.
- SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
-
- /// The allocator for IRInstructionDataLists.
- SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
-
- /// Map Instructions to unsigned integers and wraps the Instruction in an
- /// instance of IRInstructionData.
- IRInstructionMapper Mapper;
-
- /// The SimilarityGroups found with the most recent run of \ref
- /// findSimilarity. None if there is no recent run.
- Optional<SimilarityGroupList> SimilarityCandidates;
-};
-
-} // end namespace IRSimilarity
-
-/// An analysis pass based on legacy pass manager that runs and returns
-/// IRSimilarityIdentifier run on the Module.
-class IRSimilarityIdentifierWrapperPass : public ModulePass {
- std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
-
-public:
- static char ID;
- IRSimilarityIdentifierWrapperPass();
-
- IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
- const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
-
- bool doInitialization(Module &M) override;
- bool doFinalization(Module &M) override;
- bool runOnModule(Module &M) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-};
-
-/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
-/// Module.
-class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
-public:
- typedef IRSimilarity::IRSimilarityIdentifier Result;
-
- Result run(Module &M, ModuleAnalysisManager &);
-
-private:
- friend AnalysisInfoMixin<IRSimilarityAnalysis>;
- static AnalysisKey Key;
-};
-
-/// Printer pass that uses \c IRSimilarityAnalysis.
-class IRSimilarityAnalysisPrinterPass
- : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
- raw_ostream &OS;
-
-public:
- explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
+#pragma once
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// Interface file for the IRSimilarityIdentifier for identifying similarities in
+// IR including the IRInstructionMapper, which maps an Instruction to unsigned
+// integers.
+//
+// Two sequences of instructions are called "similar" if they perform the same
+// series of operations for all inputs.
+//
+// \code
+// %1 = add i32 %a, 10
+// %2 = add i32 %a, %1
+// %3 = icmp slt icmp %1, %2
+// \endcode
+//
+// and
+//
+// \code
+// %1 = add i32 11, %a
+// %2 = sub i32 %a, %1
+// %3 = icmp sgt icmp %2, %1
+// \endcode
+//
+// ultimately have the same result, even if the inputs, and structure are
+// slightly different.
+//
+// For instructions, we do not worry about operands that do not have fixed
+// semantic meaning to the program. We consider the opcode that the instruction
+// has, the types, parameters, and extra information such as the function name,
+// or comparison predicate. These are used to create a hash to map instructions
+// to integers to be used in similarity matching in sequences of instructions
+//
+// Terminology:
+// An IRSimilarityCandidate is a region of IRInstructionData (wrapped
+// Instructions), usually used to denote a region of similarity has been found.
+//
+// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
+// similar to one another.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
+#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
+
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+
+namespace llvm {
+namespace IRSimilarity {
+
+struct IRInstructionDataList;
+
+/// This represents what is and is not supported when finding similarity in
+/// Instructions.
+///
+/// Legal Instructions are considered when looking at similarity between
+/// Instructions.
+///
+/// Illegal Instructions cannot be considered when looking for similarity
+/// between Instructions. They act as boundaries between similarity regions.
+///
+/// Invisible Instructions are skipped over during analysis.
+// TODO: Shared with MachineOutliner
+enum InstrType { Legal, Illegal, Invisible };
+
+/// This provides the utilities for hashing an Instruction to an unsigned
+/// integer. Two IRInstructionDatas produce the same hash value when their
+/// underlying Instructions perform the same operation (even if they don't have
+/// the same input operands.)
+/// As a more concrete example, consider the following:
+///
+/// \code
+/// %add1 = add i32 %a, %b
+/// %add2 = add i32 %c, %d
+/// %add3 = add i64 %e, %f
+/// \endcode
+///
+// Then the IRInstructionData wrappers for these Instructions may be hashed like
+/// so:
+///
+/// \code
+/// ; These two adds have the same types and operand types, so they hash to the
+/// ; same number.
+/// %add1 = add i32 %a, %b ; Hash: 1
+/// %add2 = add i32 %c, %d ; Hash: 1
+/// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
+/// ; it hashes to a different number.
+/// %add3 = add i64 %e, %f; Hash: 2
+/// \endcode
+///
+///
+/// This hashing scheme will be used to represent the program as a very long
+/// string. This string can then be placed in a data structure which can be used
+/// for similarity queries.
+///
+/// TODO: Handle types of Instructions which can be equal even with different
+/// operands. (E.g. comparisons with swapped predicates.)
+/// TODO: Handle CallInsts, which are only checked for function type
+/// by \ref isSameOperationAs.
+/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
+/// exact same, and some do not.
+struct IRInstructionData : ilist_node<IRInstructionData> {
+
+ /// The source Instruction that is being wrapped.
+ Instruction *Inst = nullptr;
+ /// The values of the operands in the Instruction.
+ SmallVector<Value *, 4> OperVals;
+ /// The legality of the wrapped instruction. This is informed by InstrType,
+ /// and is used when checking when two instructions are considered similar.
+ /// If either instruction is not legal, the instructions are automatically not
+ /// considered similar.
+ bool Legal;
+
+ /// This is only relevant if we are wrapping a CmpInst where we needed to
+ /// change the predicate of a compare instruction from a greater than form
+ /// to a less than form. It is None otherwise.
+ Optional<CmpInst::Predicate> RevisedPredicate;
+
+ /// Gather the information that is difficult to gather for an Instruction, or
+ /// is changed. i.e. the operands of an Instruction and the Types of those
+ /// operands. This extra information allows for similarity matching to make
+ /// assertions that allow for more flexibility when checking for whether an
+ /// Instruction performs the same operation.
+ IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
+
+ /// Get the predicate that the compare instruction is using for hashing the
+ /// instruction. the IRInstructionData must be wrapping a CmpInst.
+ CmpInst::Predicate getPredicate() const;
+
+ /// A function that swaps the predicates to their less than form if they are
+ /// in a greater than form. Otherwise, the predicate is unchanged.
+ ///
+ /// \param CI - The comparison operation to find a consistent preidcate for.
+ /// \return the consistent comparison predicate.
+ static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
+
+ /// Hashes \p Value based on its opcode, types, and operand types.
+ /// Two IRInstructionData instances produce the same hash when they perform
+ /// the same operation.
+ ///
+ /// As a simple example, consider the following instructions.
+ ///
+ /// \code
+ /// %add1 = add i32 %x1, %y1
+ /// %add2 = add i32 %x2, %y2
+ ///
+ /// %sub = sub i32 %x1, %y1
+ ///
+ /// %add_i64 = add i64 %x2, %y2
+ /// \endcode
+ ///
+ /// Because the first two adds operate the same types, and are performing the
+ /// same action, they will be hashed to the same value.
+ ///
+ /// However, the subtraction instruction is not the same as an addition, and
+ /// will be hashed to a different value.
+ ///
+ /// Finally, the last add has a different type compared to the first two add
+ /// instructions, so it will also be hashed to a different value that any of
+ /// the previous instructions.
+ ///
+ /// \param [in] ID - The IRInstructionData instance to be hashed.
+ /// \returns A hash_value of the IRInstructionData.
+ friend hash_code hash_value(const IRInstructionData &ID) {
+ SmallVector<Type *, 4> OperTypes;
+ for (Value *V : ID.OperVals)
+ OperTypes.push_back(V->getType());
+
+ if (isa<CmpInst>(ID.Inst))
+ return llvm::hash_combine(
+ llvm::hash_value(ID.Inst->getOpcode()),
+ llvm::hash_value(ID.Inst->getType()),
+ llvm::hash_value(ID.getPredicate()),
+ llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
+ else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst))
+ return llvm::hash_combine(
+ llvm::hash_value(ID.Inst->getOpcode()),
+ llvm::hash_value(ID.Inst->getType()),
+ llvm::hash_value(CI->getCalledFunction()->getName().str()),
+ llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
+ return llvm::hash_combine(
+ llvm::hash_value(ID.Inst->getOpcode()),
+ llvm::hash_value(ID.Inst->getType()),
+ llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
+ }
+
+ IRInstructionDataList *IDL = nullptr;
+};
+
+struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
+
+/// Compare one IRInstructionData class to another IRInstructionData class for
+/// whether they are performing a the same operation, and can mapped to the
+/// same value. For regular instructions if the hash value is the same, then
+/// they will also be close.
+///
+/// \param A - The first IRInstructionData class to compare
+/// \param B - The second IRInstructionData class to compare
+/// \returns true if \p A and \p B are similar enough to be mapped to the same
+/// value.
+bool isClose(const IRInstructionData &A, const IRInstructionData &B);
+
+struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
+ static inline IRInstructionData *getEmptyKey() { return nullptr; }
+ static inline IRInstructionData *getTombstoneKey() {
+ return reinterpret_cast<IRInstructionData *>(-1);
+ }
+
+ static unsigned getHashValue(const IRInstructionData *E) {
+ using llvm::hash_value;
+ assert(E && "IRInstructionData is a nullptr?");
+ return hash_value(*E);
+ }
+
+ static bool isEqual(const IRInstructionData *LHS,
+ const IRInstructionData *RHS) {
+ if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
+ LHS == getEmptyKey() || LHS == getTombstoneKey())
+ return LHS == RHS;
+
+ assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
+ return isClose(*LHS, *RHS);
+ }
+};
+
+/// Helper struct for converting the Instructions in a Module into a vector of
+/// unsigned integers. This vector of unsigned integers can be thought of as a
+/// "numeric string". This numeric string can then be queried by, for example,
+/// data structures that find repeated substrings.
+///
+/// This hashing is done per BasicBlock in the module. To hash Instructions
+/// based off of their operations, each Instruction is wrapped in an
+/// IRInstructionData struct. The unsigned integer for an IRInstructionData
+/// depends on:
+/// - The hash provided by the IRInstructionData.
+/// - Which member of InstrType the IRInstructionData is classified as.
+// See InstrType for more details on the possible classifications, and how they
+// manifest in the numeric string.
+///
+/// The numeric string for an individual BasicBlock is terminated by an unique
+/// unsigned integer. This prevents data structures which rely on repetition
+/// from matching across BasicBlocks. (For example, the SuffixTree.)
+/// As a concrete example, if we have the following two BasicBlocks:
+/// \code
+/// bb0:
+/// %add1 = add i32 %a, %b
+/// %add2 = add i32 %c, %d
+/// %add3 = add i64 %e, %f
+/// bb1:
+/// %sub = sub i32 %c, %d
+/// \endcode
+/// We may hash the Instructions like this (via IRInstructionData):
+/// \code
+/// bb0:
+/// %add1 = add i32 %a, %b ; Hash: 1
+/// %add2 = add i32 %c, %d; Hash: 1
+/// %add3 = add i64 %e, %f; Hash: 2
+/// bb1:
+/// %sub = sub i32 %c, %d; Hash: 3
+/// %add4 = add i32 %c, %d ; Hash: 1
+/// \endcode
+/// And produce a "numeric string representation" like so:
+/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
+///
+/// TODO: This is very similar to the MachineOutliner, and should be
+/// consolidated into the same interface.
+struct IRInstructionMapper {
+ /// The starting illegal instruction number to map to.
+ ///
+ /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
+ unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
+
+ /// The next available integer to assign to a legal Instruction to.
+ unsigned LegalInstrNumber = 0;
+
+ /// Correspondence from IRInstructionData to unsigned integers.
+ DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
+ InstructionIntegerMap;
+
+ /// Set if we added an illegal number in the previous step.
+ /// Since each illegal number is unique, we only need one of them between
+ /// each range of legal numbers. This lets us make sure we don't add more
+ /// than one illegal number per range.
+ bool AddedIllegalLastTime = false;
+
+ /// Marks whether we found a illegal instruction in the previous step.
+ bool CanCombineWithPrevInstr = false;
+
+ /// Marks whether we have found a set of instructions that is long enough
+ /// to be considered for similarity.
+ bool HaveLegalRange = false;
+
+ /// This allocator pointer is in charge of holding on to the IRInstructionData
+ /// so it is not deallocated until whatever external tool is using it is done
+ /// with the information.
+ SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
+
+ /// This allocator pointer is in charge of creating the IRInstructionDataList
+ /// so it is not deallocated until whatever external tool is using it is done
+ /// with the information.
+ SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
+
+ /// Get an allocated IRInstructionData struct using the InstDataAllocator.
+ ///
+ /// \param I - The Instruction to wrap with IRInstructionData.
+ /// \param Legality - A boolean value that is true if the instruction is to
+ /// be considered for similarity, and false if not.
+ /// \param IDL - The InstructionDataList that the IRInstructionData is
+ /// inserted into.
+ /// \returns An allocated IRInstructionData struct.
+ IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
+ IRInstructionDataList &IDL);
+
+ /// Get an allocated IRInstructionDataList object using the IDLAllocator.
+ ///
+ /// \returns An allocated IRInstructionDataList object.
+ IRInstructionDataList *allocateIRInstructionDataList();
+
+ IRInstructionDataList *IDL = nullptr;
+
+ /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
+ /// determined by \p InstrType. Two Instructions are mapped to the same value
+ /// if they are close as defined by the InstructionData class above.
+ ///
+ /// \param [in] BB - The BasicBlock to be mapped to integers.
+ /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
+ /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
+ void convertToUnsignedVec(BasicBlock &BB,
+ std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+ /// Maps an Instruction to a legal integer.
+ ///
+ /// \param [in] It - The Instruction to be mapped to an integer.
+ /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
+ /// append to.
+ /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
+ /// \returns The integer \p It was mapped to.
+ unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
+ std::vector<unsigned> &IntegerMappingForBB,
+ std::vector<IRInstructionData *> &InstrListForBB);
+
+ /// Maps an Instruction to an illegal integer.
+ ///
+ /// \param [in] It - The \p Instruction to be mapped to an integer.
+ /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
+ /// append to.
+ /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
+ /// \param End - true if creating a dummy IRInstructionData at the end of a
+ /// basic block.
+ /// \returns The integer \p It was mapped to.
+ unsigned mapToIllegalUnsigned(
+ BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
+ std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
+
+ IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
+ SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
+ : InstDataAllocator(IDA), IDLAllocator(IDLA) {
+ // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
+ // changed.
+ assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
+ "DenseMapInfo<unsigned>'s empty key isn't -1!");
+ assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
+ static_cast<unsigned>(-2) &&
+ "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+
+ IDL = new (IDLAllocator->Allocate())
+ IRInstructionDataList();
+ }
+
+ /// Custom InstVisitor to classify different instructions for whether it can
+ /// be analyzed for similarity.
+ struct InstructionClassification
+ : public InstVisitor<InstructionClassification, InstrType> {
+ InstructionClassification() {}
+
+ // TODO: Determine a scheme to resolve when the label is similar enough.
+ InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
+ // TODO: Determine a scheme to resolve when the labels are similar enough.
+ InstrType visitPHINode(PHINode &PN) { return Illegal; }
+ // TODO: Handle allocas.
+ InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
+ // We exclude variable argument instructions since variable arguments
+ // requires extra checking of the argument list.
+ InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
+ // We exclude all exception handling cases since they are so context
+ // dependent.
+ InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
+ InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
+ // DebugInfo should be included in the regions, but should not be
+ // analyzed for similarity as it has no bearing on the outcome of the
+ // program.
+ InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
+ // TODO: Handle specific intrinsics.
+ InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
+ // We only allow call instructions where the function has a name and
+ // is not an indirect call.
+ InstrType visitCallInst(CallInst &CI) {
+ Function *F = CI.getCalledFunction();
+ if (!F || CI.isIndirectCall() || !F->hasName())
+ return Illegal;
+ return Legal;
+ }
+ // TODO: We do not current handle similarity that changes the control flow.
+ InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
+ // TODO: We do not current handle similarity that changes the control flow.
+ InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
+ // TODO: Handle interblock similarity.
+ InstrType visitTerminator(Instruction &I) { return Illegal; }
+ InstrType visitInstruction(Instruction &I) { return Legal; }
+ };
+
+ /// Maps an Instruction to a member of InstrType.
+ InstructionClassification InstClassifier;
+};
+
+/// This is a class that wraps a range of IRInstructionData from one point to
+/// another in the vector of IRInstructionData, which is a region of the
+/// program. It is also responsible for defining the structure within this
+/// region of instructions.
+///
+/// The structure of a region is defined through a value numbering system
+/// assigned to each unique value in a region at the creation of the
+/// IRSimilarityCandidate.
+///
+/// For example, for each Instruction we add a mapping for each new
+/// value seen in that Instruction.
+/// IR: Mapping Added:
+/// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
+/// %add2 = add i32 %a, %1 %add2 -> 4
+/// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
+///
+/// We can compare IRSimilarityCandidates against one another.
+/// The \ref isSimilar function compares each IRInstructionData against one
+/// another and if we have the same sequences of IRInstructionData that would
+/// create the same hash, we have similar IRSimilarityCandidates.
+///
+/// We can also compare the structure of IRSimilarityCandidates. If we can
+/// create a mapping of registers in the region contained by one
+/// IRSimilarityCandidate to the region contained by different
+/// IRSimilarityCandidate, they can be considered structurally similar.
+///
+/// IRSimilarityCandidate1: IRSimilarityCandidate2:
+/// %add1 = add i32 %a, %b %add1 = add i32 %d, %e
+/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
+/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
+///
+/// Can have the following mapping from candidate to candidate of:
+/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
+/// and can be considered similar.
+///
+/// IRSimilarityCandidate1: IRSimilarityCandidate2:
+/// %add1 = add i32 %a, %b %add1 = add i32 %d, c4
+/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
+/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
+///
+/// We cannot create the same mapping since the use of c4 is not used in the
+/// same way as %b or c2.
+class IRSimilarityCandidate {
+private:
+ /// The start index of this IRSimilarityCandidate in the instruction list.
+ unsigned StartIdx = 0;
+
+ /// The number of instructions in this IRSimilarityCandidate.
+ unsigned Len = 0;
+
+ /// The first instruction in this IRSimilarityCandidate.
+ IRInstructionData *FirstInst = nullptr;
+
+ /// The last instruction in this IRSimilarityCandidate.
+ IRInstructionData *LastInst = nullptr;
+
+ /// Global Value Numbering structures
+ /// @{
+ /// Stores the mapping of the value to the number assigned to it in the
+ /// IRSimilarityCandidate.
+ DenseMap<Value *, unsigned> ValueToNumber;
+ /// Stores the mapping of the number to the value assigned this number.
+ DenseMap<unsigned, Value *> NumberToValue;
+ /// @}
+
+public:
+ /// \param StartIdx - The starting location of the region.
+ /// \param Len - The length of the region.
+ /// \param FirstInstIt - The starting IRInstructionData of the region.
+ /// \param LastInstIt - The ending IRInstructionData of the region.
+ IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
+ IRInstructionData *FirstInstIt,
+ IRInstructionData *LastInstIt);
+
+ /// \param A - The first IRInstructionCandidate to compare.
+ /// \param B - The second IRInstructionCandidate to compare.
+ /// \returns True when every IRInstructionData in \p A is similar to every
+ /// IRInstructionData in \p B.
+ static bool isSimilar(const IRSimilarityCandidate &A,
+ const IRSimilarityCandidate &B);
+
+ /// \param A - The first IRInstructionCandidate to compare.
+ /// \param B - The second IRInstructionCandidate to compare.
+ /// \returns True when every IRInstructionData in \p A is structurally similar
+ /// to \p B.
+ static bool compareStructure(const IRSimilarityCandidate &A,
+ const IRSimilarityCandidate &B);
+
+ struct OperandMapping {
+ /// The IRSimilarityCandidate that holds the instruction the OperVals were
+ /// pulled from.
+ const IRSimilarityCandidate &IRSC;
+
+ /// The operand values to be analyzed.
+ ArrayRef<Value *> &OperVals;
+
+ /// The current mapping of global value numbers from one IRSimilarityCandidate
+ /// to another IRSimilarityCandidate.
+ DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
+ };
+
+ /// Compare the operands in \p A and \p B and check that the current mapping
+ /// of global value numbers from \p A to \p B and \p B to \A is consistent.
+ ///
+ /// \param A - The first IRInstructionCandidate, operand values, and current
+ /// operand mappings to compare.
+ /// \param B - The second IRInstructionCandidate, operand values, and current
+ /// operand mappings to compare.
+ /// \returns true if the IRSimilarityCandidates operands are compatible.
+ static bool compareNonCommutativeOperandMapping(OperandMapping A,
+ OperandMapping B);
+
+ /// Compare the operands in \p A and \p B and check that the current mapping
+ /// of global value numbers from \p A to \p B and \p B to \A is consistent
+ /// given that the operands are commutative.
+ ///
+ /// \param A - The first IRInstructionCandidate, operand values, and current
+ /// operand mappings to compare.
+ /// \param B - The second IRInstructionCandidate, operand values, and current
+ /// operand mappings to compare.
+ /// \returns true if the IRSimilarityCandidates operands are compatible.
+ static bool compareCommutativeOperandMapping(OperandMapping A,
+ OperandMapping B);
+
+ /// Compare the start and end indices of the two IRSimilarityCandidates for
+ /// whether they overlap. If the start instruction of one
+ /// IRSimilarityCandidate is less than the end instruction of the other, and
+ /// the start instruction of one is greater than the start instruction of the
+ /// other, they overlap.
+ ///
+ /// \returns true if the IRSimilarityCandidates do not have overlapping
+ /// instructions.
+ static bool overlap(const IRSimilarityCandidate &A,
+ const IRSimilarityCandidate &B);
+
+ /// \returns the number of instructions in this Candidate.
+ unsigned getLength() const { return Len; }
+
+ /// \returns the start index of this IRSimilarityCandidate.
+ unsigned getStartIdx() const { return StartIdx; }
+
+ /// \returns the end index of this IRSimilarityCandidate.
+ unsigned getEndIdx() const { return StartIdx + Len - 1; }
+
+ /// \returns The first IRInstructionData.
+ IRInstructionData *front() const { return FirstInst; }
+ /// \returns The last IRInstructionData.
+ IRInstructionData *back() const { return LastInst; }
+
+ /// \returns The first Instruction.
+ Instruction *frontInstruction() { return FirstInst->Inst; }
+ /// \returns The last Instruction
+ Instruction *backInstruction() { return LastInst->Inst; }
+
+ /// \returns The BasicBlock the IRSimilarityCandidate starts in.
+ BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
+ /// \returns The BasicBlock the IRSimilarityCandidate ends in.
+ BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
+
+ /// \returns The Function that the IRSimilarityCandidate is located in.
+ Function *getFunction() { return getStartBB()->getParent(); }
+
+ /// Finds the positive number associated with \p V if it has been mapped.
+ /// \param [in] V - the Value to find.
+ /// \returns The positive number corresponding to the value.
+ /// \returns None if not present.
+ Optional<unsigned> getGVN(Value *V) {
+ assert(V != nullptr && "Value is a nullptr?");
+ DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
+ if (VNIt == ValueToNumber.end())
+ return None;
+ return VNIt->second;
+ }
+
+ /// Finds the Value associate with \p Num if it exists.
+ /// \param [in] Num - the number to find.
+ /// \returns The Value associated with the number.
+ /// \returns None if not present.
+ Optional<Value *> fromGVN(unsigned Num) {
+ DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
+ if (VNIt == NumberToValue.end())
+ return None;
+ assert(VNIt->second != nullptr && "Found value is a nullptr!");
+ return VNIt->second;
+ }
+
+ /// \param RHS -The IRSimilarityCandidate to compare against
+ /// \returns true if the IRSimilarityCandidate is occurs after the
+ /// IRSimilarityCandidate in the program.
+ bool operator<(const IRSimilarityCandidate &RHS) const {
+ return getStartIdx() > RHS.getStartIdx();
+ }
+
+ using iterator = IRInstructionDataList::iterator;
+ iterator begin() const { return iterator(front()); }
+ iterator end() const { return std::next(iterator(back())); }
+};
+
+typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
+typedef std::vector<SimilarityGroup> SimilarityGroupList;
+
+/// This class puts all the pieces of the IRInstructionData,
+/// IRInstructionMapper, IRSimilarityCandidate together.
+///
+/// It first feeds the Module or vector of Modules into the IRInstructionMapper,
+/// and puts all the mapped instructions into a single long list of
+/// IRInstructionData.
+///
+/// The list of unsigned integers is given to the Suffix Tree or similar data
+/// structure to find repeated subsequences. We construct an
+/// IRSimilarityCandidate for each instance of the subsequence. We compare them
+/// against one another since These repeated subsequences can have different
+/// structure. For each different kind of structure found, we create a
+/// similarity group.
+///
+/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
+/// structurally similar to one another, while C is different we would have two
+/// SimilarityGroups:
+///
+/// SimilarityGroup 1: SimilarityGroup 2
+/// A, B, D C
+///
+/// A list of the different similarity groups is then returned after
+/// analyzing the module.
+class IRSimilarityIdentifier {
+public:
+ IRSimilarityIdentifier()
+ : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
+
+ /// \param M the module to find similarity in.
+ explicit IRSimilarityIdentifier(Module &M)
+ : Mapper(&InstDataAllocator, &InstDataListAllocator) {
+ findSimilarity(M);
+ }
+
+private:
+ /// Map the instructions in the module to unsigned integers, using mapping
+ /// already present in the Mapper if possible.
+ ///
+ /// \param [in] M Module - To map to integers.
+ /// \param [in,out] InstrList - The vector to append IRInstructionData to.
+ /// \param [in,out] IntegerMapping - The vector to append integers to.
+ void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+ /// Map the instructions in the modules vector to unsigned integers, using
+ /// mapping already present in the mapper if possible.
+ ///
+ /// \param [in] Modules - The list of modules to use to populate the mapper
+ /// \param [in,out] InstrList - The vector to append IRInstructionData to.
+ /// \param [in,out] IntegerMapping - The vector to append integers to.
+ void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
+ std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+ /// Find the similarity candidates in \p InstrList and corresponding
+ /// \p UnsignedVec
+ ///
+ /// \param [in,out] InstrList - The vector to append IRInstructionData to.
+ /// \param [in,out] IntegerMapping - The vector to append integers to.
+ /// candidates found in the program.
+ void findCandidates(std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+public:
+ // Find the IRSimilarityCandidates in the \p Modules and group by structural
+ // similarity in a SimilarityGroup, each group is returned in a
+ // SimilarityGroupList.
+ //
+ // \param [in] Modules - the modules to analyze.
+ // \returns The groups of similarity ranges found in the modules.
+ SimilarityGroupList &
+ findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
+
+ // Find the IRSimilarityCandidates in the given Module grouped by structural
+ // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
+ //
+ // \param [in] M - the module to analyze.
+ // \returns The groups of similarity ranges found in the module.
+ SimilarityGroupList &findSimilarity(Module &M);
+
+ // Clears \ref SimilarityCandidates if it is already filled by a previous run.
+ void resetSimilarityCandidates() {
+ // If we've already analyzed a Module or set of Modules, so we must clear
+ // the SimilarityCandidates to make sure we do not have only old values
+ // hanging around.
+ if (SimilarityCandidates.hasValue())
+ SimilarityCandidates->clear();
+ else
+ SimilarityCandidates = SimilarityGroupList();
+ }
+
+ // \returns The groups of similarity ranges found in the most recently passed
+ // set of modules.
+ Optional<SimilarityGroupList> &getSimilarity() {
+ return SimilarityCandidates;
+ }
+
+private:
+ /// The allocator for IRInstructionData.
+ SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
+
+ /// The allocator for IRInstructionDataLists.
+ SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
+
+ /// Map Instructions to unsigned integers and wraps the Instruction in an
+ /// instance of IRInstructionData.
+ IRInstructionMapper Mapper;
+
+ /// The SimilarityGroups found with the most recent run of \ref
+ /// findSimilarity. None if there is no recent run.
+ Optional<SimilarityGroupList> SimilarityCandidates;
+};
+
+} // end namespace IRSimilarity
+
+/// An analysis pass based on legacy pass manager that runs and returns
+/// IRSimilarityIdentifier run on the Module.
+class IRSimilarityIdentifierWrapperPass : public ModulePass {
+ std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
+
+public:
+ static char ID;
+ IRSimilarityIdentifierWrapperPass();
+
+ IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
+ const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
+
+ bool doInitialization(Module &M) override;
+ bool doFinalization(Module &M) override;
+ bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+};
+
+/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
+/// Module.
+class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
+public:
+ typedef IRSimilarity::IRSimilarityIdentifier Result;
+
+ Result run(Module &M, ModuleAnalysisManager &);
+
+private:
+ friend AnalysisInfoMixin<IRSimilarityAnalysis>;
+ static AnalysisKey Key;
+};
+
+/// Printer pass that uses \c IRSimilarityAnalysis.
+class IRSimilarityAnalysisPrinterPass
+ : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
+ raw_ostream &OS;
+
+public:
+ explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif