aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/llvm12/include/llvm/Analysis/IRSimilarityIdentifier.h
blob: 4544bfc8b9dfb32ba6ab2dc3d308b287b04ea7ca (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
#pragma once 
 
#ifdef __GNUC__ 
#pragma GCC diagnostic push 
#pragma GCC diagnostic ignored "-Wunused-parameter" 
#endif 
 
//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// 
// 
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 
// 
//===----------------------------------------------------------------------===// 
// 
// \file 
// Interface file for the IRSimilarityIdentifier for identifying similarities in 
// IR including the IRInstructionMapper, which maps an Instruction to unsigned 
// integers. 
// 
// Two sequences of instructions are called "similar" if they perform the same 
// series of operations for all inputs. 
// 
// \code 
// %1 = add i32 %a, 10 
// %2 = add i32 %a, %1 
// %3 = icmp slt icmp %1, %2 
// \endcode 
// 
// and 
// 
// \code 
// %1 = add i32 11, %a 
// %2 = sub i32 %a, %1 
// %3 = icmp sgt icmp %2, %1 
// \endcode 
// 
// ultimately have the same result, even if the inputs, and structure are 
// slightly different. 
// 
// For instructions, we do not worry about operands that do not have fixed 
// semantic meaning to the program.  We consider the opcode that the instruction 
// has, the types, parameters, and extra information such as the function name, 
// or comparison predicate.  These are used to create a hash to map instructions 
// to integers to be used in similarity matching in sequences of instructions 
// 
// Terminology: 
// An IRSimilarityCandidate is a region of IRInstructionData (wrapped 
// Instructions), usually used to denote a region of similarity has been found. 
// 
// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally 
// similar to one another. 
// 
//===----------------------------------------------------------------------===// 
 
#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 
#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 
 
#include "llvm/IR/InstVisitor.h" 
#include "llvm/IR/Instructions.h" 
#include "llvm/IR/Module.h" 
#include "llvm/IR/PassManager.h" 
#include "llvm/Pass.h" 
#include "llvm/Support/Allocator.h" 
 
namespace llvm { 
namespace IRSimilarity { 
 
struct IRInstructionDataList; 
 
/// This represents what is and is not supported when finding similarity in 
/// Instructions. 
/// 
/// Legal Instructions are considered when looking at similarity between 
/// Instructions. 
/// 
/// Illegal Instructions cannot be considered when looking for similarity 
/// between Instructions. They act as boundaries between similarity regions. 
/// 
/// Invisible Instructions are skipped over during analysis. 
// TODO: Shared with MachineOutliner 
enum InstrType { Legal, Illegal, Invisible }; 
 
/// This provides the utilities for hashing an Instruction to an unsigned 
/// integer. Two IRInstructionDatas produce the same hash value when their 
/// underlying Instructions perform the same operation (even if they don't have 
/// the same input operands.) 
/// As a more concrete example, consider the following: 
/// 
/// \code 
/// %add1 = add i32 %a, %b 
/// %add2 = add i32 %c, %d 
/// %add3 = add i64 %e, %f 
/// \endcode 
/// 
// Then the IRInstructionData wrappers for these Instructions may be hashed like 
/// so: 
/// 
/// \code 
/// ; These two adds have the same types and operand types, so they hash to the 
/// ; same number. 
/// %add1 = add i32 %a, %b ; Hash: 1 
/// %add2 = add i32 %c, %d ; Hash: 1 
/// ; This add produces an i64. This differentiates it from %add1 and %add2. So, 
/// ; it hashes to a different number. 
/// %add3 = add i64 %e, %f; Hash: 2 
/// \endcode 
/// 
/// 
/// This hashing scheme will be used to represent the program as a very long 
/// string. This string can then be placed in a data structure which can be used 
/// for similarity queries. 
/// 
/// TODO: Handle types of Instructions which can be equal even with different 
/// operands. (E.g. comparisons with swapped predicates.) 
/// TODO: Handle CallInsts, which are only checked for function type 
/// by \ref isSameOperationAs. 
/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the 
/// exact same, and some do not. 
struct IRInstructionData : ilist_node<IRInstructionData> { 
 
  /// The source Instruction that is being wrapped. 
  Instruction *Inst = nullptr; 
  /// The values of the operands in the Instruction. 
  SmallVector<Value *, 4> OperVals; 
  /// The legality of the wrapped instruction. This is informed by InstrType, 
  /// and is used when checking when two instructions are considered similar. 
  /// If either instruction is not legal, the instructions are automatically not 
  /// considered similar. 
  bool Legal; 
 
  /// This is only relevant if we are wrapping a CmpInst where we needed to 
  /// change the predicate of a compare instruction from a greater than form 
  /// to a less than form.  It is None otherwise. 
  Optional<CmpInst::Predicate> RevisedPredicate; 
 
  /// Gather the information that is difficult to gather for an Instruction, or 
  /// is changed. i.e. the operands of an Instruction and the Types of those 
  /// operands. This extra information allows for similarity matching to make 
  /// assertions that allow for more flexibility when checking for whether an 
  /// Instruction performs the same operation. 
  IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); 
 
  /// Get the predicate that the compare instruction is using for hashing the 
  /// instruction. the IRInstructionData must be wrapping a CmpInst. 
  CmpInst::Predicate getPredicate() const; 
 
  /// A function that swaps the predicates to their less than form if they are 
  /// in a greater than form. Otherwise, the predicate is unchanged. 
  /// 
  /// \param CI - The comparison operation to find a consistent preidcate for. 
  /// \return the consistent comparison predicate.  
  static CmpInst::Predicate predicateForConsistency(CmpInst *CI); 
 
  /// Hashes \p Value based on its opcode, types, and operand types. 
  /// Two IRInstructionData instances produce the same hash when they perform 
  /// the same operation. 
  /// 
  /// As a simple example, consider the following instructions. 
  /// 
  /// \code 
  /// %add1 = add i32 %x1, %y1 
  /// %add2 = add i32 %x2, %y2 
  /// 
  /// %sub = sub i32 %x1, %y1 
  /// 
  /// %add_i64 = add i64 %x2, %y2 
  /// \endcode 
  /// 
  /// Because the first two adds operate the same types, and are performing the 
  /// same action, they will be hashed to the same value. 
  /// 
  /// However, the subtraction instruction is not the same as an addition, and 
  /// will be hashed to a different value. 
  /// 
  /// Finally, the last add has a different type compared to the first two add 
  /// instructions, so it will also be hashed to a different value that any of 
  /// the previous instructions. 
  /// 
  /// \param [in] ID - The IRInstructionData instance to be hashed. 
  /// \returns A hash_value of the IRInstructionData. 
  friend hash_code hash_value(const IRInstructionData &ID) { 
    SmallVector<Type *, 4> OperTypes; 
    for (Value *V : ID.OperVals) 
      OperTypes.push_back(V->getType()); 
 
    if (isa<CmpInst>(ID.Inst)) 
      return llvm::hash_combine( 
          llvm::hash_value(ID.Inst->getOpcode()), 
          llvm::hash_value(ID.Inst->getType()), 
          llvm::hash_value(ID.getPredicate()), 
          llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 
    else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst)) 
      return llvm::hash_combine( 
          llvm::hash_value(ID.Inst->getOpcode()), 
          llvm::hash_value(ID.Inst->getType()), 
          llvm::hash_value(CI->getCalledFunction()->getName().str()), 
          llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 
    return llvm::hash_combine( 
        llvm::hash_value(ID.Inst->getOpcode()), 
        llvm::hash_value(ID.Inst->getType()), 
        llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 
  } 
 
  IRInstructionDataList *IDL = nullptr; 
}; 
 
struct IRInstructionDataList : simple_ilist<IRInstructionData> {}; 
 
/// Compare one IRInstructionData class to another IRInstructionData class for 
/// whether they are performing a the same operation, and can mapped to the 
/// same value. For regular instructions if the hash value is the same, then 
/// they will also be close. 
/// 
/// \param A - The first IRInstructionData class to compare 
/// \param B - The second IRInstructionData class to compare 
/// \returns true if \p A and \p B are similar enough to be mapped to the same 
/// value. 
bool isClose(const IRInstructionData &A, const IRInstructionData &B); 
 
struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { 
  static inline IRInstructionData *getEmptyKey() { return nullptr; } 
  static inline IRInstructionData *getTombstoneKey() { 
    return reinterpret_cast<IRInstructionData *>(-1); 
  } 
 
  static unsigned getHashValue(const IRInstructionData *E) { 
    using llvm::hash_value; 
    assert(E && "IRInstructionData is a nullptr?"); 
    return hash_value(*E); 
  } 
 
  static bool isEqual(const IRInstructionData *LHS, 
                      const IRInstructionData *RHS) { 
    if (RHS == getEmptyKey() || RHS == getTombstoneKey() || 
        LHS == getEmptyKey() || LHS == getTombstoneKey()) 
      return LHS == RHS; 
 
    assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); 
    return isClose(*LHS, *RHS); 
  } 
}; 
 
/// Helper struct for converting the Instructions in a Module into a vector of 
/// unsigned integers. This vector of unsigned integers can be thought of as a 
/// "numeric string". This numeric string can then be queried by, for example, 
/// data structures that find repeated substrings. 
/// 
/// This hashing is done per BasicBlock in the module. To hash Instructions 
/// based off of their operations, each Instruction is wrapped in an 
/// IRInstructionData struct. The unsigned integer for an IRInstructionData 
/// depends on: 
/// - The hash provided by the IRInstructionData. 
/// - Which member of InstrType the IRInstructionData is classified as. 
// See InstrType for more details on the possible classifications, and how they 
// manifest in the numeric string. 
/// 
/// The numeric string for an individual BasicBlock is terminated by an unique 
/// unsigned integer. This prevents data structures which rely on repetition 
/// from matching across BasicBlocks. (For example, the SuffixTree.) 
/// As a concrete example, if we have the following two BasicBlocks: 
/// \code 
/// bb0: 
/// %add1 = add i32 %a, %b 
/// %add2 = add i32 %c, %d 
/// %add3 = add i64 %e, %f 
/// bb1: 
/// %sub = sub i32 %c, %d 
/// \endcode 
/// We may hash the Instructions like this (via IRInstructionData): 
/// \code 
/// bb0: 
/// %add1 = add i32 %a, %b ; Hash: 1 
/// %add2 = add i32 %c, %d; Hash: 1 
/// %add3 = add i64 %e, %f; Hash: 2 
/// bb1: 
/// %sub = sub i32 %c, %d; Hash: 3 
/// %add4 = add i32 %c, %d ; Hash: 1 
/// \endcode 
/// And produce a "numeric string representation" like so: 
/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 
/// 
/// TODO: This is very similar to the MachineOutliner, and should be 
/// consolidated into the same interface. 
struct IRInstructionMapper { 
  /// The starting illegal instruction number to map to. 
  /// 
  /// Set to -3 for compatibility with DenseMapInfo<unsigned>. 
  unsigned IllegalInstrNumber = static_cast<unsigned>(-3); 
 
  /// The next available integer to assign to a legal Instruction to. 
  unsigned LegalInstrNumber = 0; 
 
  /// Correspondence from IRInstructionData to unsigned integers. 
  DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> 
      InstructionIntegerMap; 
 
  /// Set if we added an illegal number in the previous step. 
  /// Since each illegal number is unique, we only need one of them between 
  /// each range of legal numbers. This lets us make sure we don't add more 
  /// than one illegal number per range. 
  bool AddedIllegalLastTime = false; 
 
  /// Marks whether we found a illegal instruction in the previous step. 
  bool CanCombineWithPrevInstr = false; 
 
  /// Marks whether we have found a set of instructions that is long enough 
  /// to be considered for similarity. 
  bool HaveLegalRange = false; 
 
  /// This allocator pointer is in charge of holding on to the IRInstructionData 
  /// so it is not deallocated until whatever external tool is using it is done 
  /// with the information. 
  SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; 
 
  /// This allocator pointer is in charge of creating the IRInstructionDataList 
  /// so it is not deallocated until whatever external tool is using it is done 
  /// with the information. 
  SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr; 
 
  /// Get an allocated IRInstructionData struct using the InstDataAllocator. 
  /// 
  /// \param I - The Instruction to wrap with IRInstructionData. 
  /// \param Legality - A boolean value that is true if the instruction is to 
  /// be considered for similarity, and false if not. 
  /// \param IDL - The InstructionDataList that the IRInstructionData is 
  /// inserted into. 
  /// \returns An allocated IRInstructionData struct. 
  IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, 
                                               IRInstructionDataList &IDL); 
 
  /// Get an allocated IRInstructionDataList object using the IDLAllocator. 
  /// 
  /// \returns An allocated IRInstructionDataList object. 
  IRInstructionDataList *allocateIRInstructionDataList(); 
 
  IRInstructionDataList *IDL = nullptr; 
 
  /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers 
  /// determined by \p InstrType. Two Instructions are mapped to the same value 
  /// if they are close as defined by the InstructionData class above. 
  /// 
  /// \param [in] BB - The BasicBlock to be mapped to integers. 
  /// \param [in,out] InstrList - Vector of IRInstructionData to append to. 
  /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. 
  void convertToUnsignedVec(BasicBlock &BB, 
                            std::vector<IRInstructionData *> &InstrList, 
                            std::vector<unsigned> &IntegerMapping); 
 
  /// Maps an Instruction to a legal integer. 
  /// 
  /// \param [in] It - The Instruction to be mapped to an integer. 
  /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to 
  /// append to. 
  /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. 
  /// \returns The integer \p It was mapped to. 
  unsigned mapToLegalUnsigned(BasicBlock::iterator &It, 
                              std::vector<unsigned> &IntegerMappingForBB, 
                              std::vector<IRInstructionData *> &InstrListForBB); 
 
  /// Maps an Instruction to an illegal integer. 
  /// 
  /// \param [in] It - The \p Instruction to be mapped to an integer. 
  /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to 
  /// append to. 
  /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. 
  /// \param End - true if creating a dummy IRInstructionData at the end of a 
  /// basic block. 
  /// \returns The integer \p It was mapped to. 
  unsigned mapToIllegalUnsigned( 
      BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, 
      std::vector<IRInstructionData *> &InstrListForBB, bool End = false); 
 
  IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA, 
                      SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA) 
      : InstDataAllocator(IDA), IDLAllocator(IDLA) { 
    // Make sure that the implementation of DenseMapInfo<unsigned> hasn't 
    // changed. 
    assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && 
           "DenseMapInfo<unsigned>'s empty key isn't -1!"); 
    assert(DenseMapInfo<unsigned>::getTombstoneKey() == 
               static_cast<unsigned>(-2) && 
           "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); 
 
    IDL = new (IDLAllocator->Allocate()) 
        IRInstructionDataList(); 
  } 
 
  /// Custom InstVisitor to classify different instructions for whether it can 
  /// be analyzed for similarity. 
  struct InstructionClassification 
      : public InstVisitor<InstructionClassification, InstrType> { 
    InstructionClassification() {} 
 
    // TODO: Determine a scheme to resolve when the label is similar enough. 
    InstrType visitBranchInst(BranchInst &BI) { return Illegal; } 
    // TODO: Determine a scheme to resolve when the labels are similar enough. 
    InstrType visitPHINode(PHINode &PN) { return Illegal; } 
    // TODO: Handle allocas. 
    InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } 
    // We exclude variable argument instructions since variable arguments 
    // requires extra checking of the argument list. 
    InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } 
    // We exclude all exception handling cases since they are so context 
    // dependent. 
    InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } 
    InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } 
    // DebugInfo should be included in the regions, but should not be 
    // analyzed for similarity as it has no bearing on the outcome of the 
    // program. 
    InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } 
    // TODO: Handle specific intrinsics. 
    InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } 
    // We only allow call instructions where the function has a name and 
    // is not an indirect call. 
    InstrType visitCallInst(CallInst &CI) { 
      Function *F = CI.getCalledFunction(); 
      if (!F || CI.isIndirectCall() || !F->hasName()) 
        return Illegal; 
      return Legal; 
    } 
    // TODO: We do not current handle similarity that changes the control flow. 
    InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } 
    // TODO: We do not current handle similarity that changes the control flow. 
    InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } 
    // TODO: Handle interblock similarity. 
    InstrType visitTerminator(Instruction &I) { return Illegal; } 
    InstrType visitInstruction(Instruction &I) { return Legal; } 
  }; 
 
  /// Maps an Instruction to a member of InstrType. 
  InstructionClassification InstClassifier; 
}; 
 
/// This is a class that wraps a range of IRInstructionData from one point to 
/// another in the vector of IRInstructionData, which is a region of the 
/// program.  It is also responsible for defining the structure within this 
/// region of instructions. 
/// 
/// The structure of a region is defined through a value numbering system 
/// assigned to each unique value in a region at the creation of the 
/// IRSimilarityCandidate. 
/// 
/// For example, for each Instruction we add a mapping for each new 
/// value seen in that Instruction. 
/// IR:                    Mapping Added: 
/// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2 
/// %add2 = add i32 %a, %1    %add2 -> 4 
/// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5 
/// 
/// We can compare IRSimilarityCandidates against one another. 
/// The \ref isSimilar function compares each IRInstructionData against one 
/// another and if we have the same sequences of IRInstructionData that would 
/// create the same hash, we have similar IRSimilarityCandidates. 
/// 
/// We can also compare the structure of IRSimilarityCandidates. If we can 
/// create a mapping of registers in the region contained by one 
/// IRSimilarityCandidate to the region contained by different 
/// IRSimilarityCandidate, they can be considered structurally similar. 
/// 
/// IRSimilarityCandidate1:   IRSimilarityCandidate2: 
/// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e 
/// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f 
/// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4 
/// 
/// Can have the following mapping from candidate to candidate of: 
/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 
/// and can be considered similar. 
/// 
/// IRSimilarityCandidate1:   IRSimilarityCandidate2: 
/// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4 
/// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f 
/// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4 
/// 
/// We cannot create the same mapping since the use of c4 is not used in the 
/// same way as %b or c2. 
class IRSimilarityCandidate { 
private: 
  /// The start index of this IRSimilarityCandidate in the instruction list. 
  unsigned StartIdx = 0; 
 
  /// The number of instructions in this IRSimilarityCandidate. 
  unsigned Len = 0; 
 
  /// The first instruction in this IRSimilarityCandidate. 
  IRInstructionData *FirstInst = nullptr; 
 
  /// The last instruction in this IRSimilarityCandidate. 
  IRInstructionData *LastInst = nullptr; 
 
  /// Global Value Numbering structures 
  /// @{ 
  /// Stores the mapping of the value to the number assigned to it in the 
  /// IRSimilarityCandidate. 
  DenseMap<Value *, unsigned> ValueToNumber; 
  /// Stores the mapping of the number to the value assigned this number. 
  DenseMap<unsigned, Value *> NumberToValue; 
  /// @} 
 
public: 
  /// \param StartIdx - The starting location of the region. 
  /// \param Len - The length of the region. 
  /// \param FirstInstIt - The starting IRInstructionData of the region. 
  /// \param LastInstIt - The ending IRInstructionData of the region. 
  IRSimilarityCandidate(unsigned StartIdx, unsigned Len, 
                        IRInstructionData *FirstInstIt, 
                        IRInstructionData *LastInstIt); 
 
  /// \param A - The first IRInstructionCandidate to compare. 
  /// \param B - The second IRInstructionCandidate to compare. 
  /// \returns True when every IRInstructionData in \p A is similar to every 
  /// IRInstructionData in \p B. 
  static bool isSimilar(const IRSimilarityCandidate &A, 
                        const IRSimilarityCandidate &B); 
 
  /// \param A - The first IRInstructionCandidate to compare. 
  /// \param B - The second IRInstructionCandidate to compare. 
  /// \returns True when every IRInstructionData in \p A is structurally similar 
  /// to \p B. 
  static bool compareStructure(const IRSimilarityCandidate &A, 
                               const IRSimilarityCandidate &B); 
 
  struct OperandMapping { 
    /// The IRSimilarityCandidate that holds the instruction the OperVals were 
    /// pulled from. 
    const IRSimilarityCandidate &IRSC; 
 
    /// The operand values to be analyzed. 
    ArrayRef<Value *> &OperVals; 
 
    /// The current mapping of global value numbers from one IRSimilarityCandidate 
    /// to another IRSimilarityCandidate. 
    DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping; 
  }; 
 
  /// Compare the operands in \p A and \p B and check that the current mapping 
  /// of global value numbers from \p A to \p B and \p B to \A is consistent. 
  /// 
  /// \param A - The first IRInstructionCandidate, operand values, and current 
  /// operand mappings to compare. 
  /// \param B - The second IRInstructionCandidate, operand values, and current 
  /// operand mappings to compare. 
  /// \returns true if the IRSimilarityCandidates operands are compatible. 
  static bool compareNonCommutativeOperandMapping(OperandMapping A, 
                                                  OperandMapping B); 
 
  /// Compare the operands in \p A and \p B and check that the current mapping 
  /// of global value numbers from \p A to \p B and \p B to \A is consistent 
  /// given that the operands are commutative. 
  /// 
  /// \param A - The first IRInstructionCandidate, operand values, and current 
  /// operand mappings to compare. 
  /// \param B - The second IRInstructionCandidate, operand values, and current 
  /// operand mappings to compare. 
  /// \returns true if the IRSimilarityCandidates operands are compatible. 
  static bool compareCommutativeOperandMapping(OperandMapping A, 
                                               OperandMapping B); 
 
  /// Compare the start and end indices of the two IRSimilarityCandidates for 
  /// whether they overlap. If the start instruction of one 
  /// IRSimilarityCandidate is less than the end instruction of the other, and 
  /// the start instruction of one is greater than the start instruction of the 
  /// other, they overlap. 
  /// 
  /// \returns true if the IRSimilarityCandidates do not have overlapping 
  /// instructions. 
  static bool overlap(const IRSimilarityCandidate &A, 
                      const IRSimilarityCandidate &B); 
 
  /// \returns the number of instructions in this Candidate. 
  unsigned getLength() const { return Len; } 
 
  /// \returns the start index of this IRSimilarityCandidate. 
  unsigned getStartIdx() const { return StartIdx; } 
 
  /// \returns the end index of this IRSimilarityCandidate. 
  unsigned getEndIdx() const { return StartIdx + Len - 1; } 
 
  /// \returns The first IRInstructionData. 
  IRInstructionData *front() const { return FirstInst; } 
  /// \returns The last IRInstructionData. 
  IRInstructionData *back() const { return LastInst; } 
 
  /// \returns The first Instruction. 
  Instruction *frontInstruction() { return FirstInst->Inst; } 
  /// \returns The last Instruction 
  Instruction *backInstruction() { return LastInst->Inst; } 
 
  /// \returns The BasicBlock the IRSimilarityCandidate starts in. 
  BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } 
  /// \returns The BasicBlock the IRSimilarityCandidate ends in. 
  BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } 
 
  /// \returns The Function that the IRSimilarityCandidate is located in. 
  Function *getFunction() { return getStartBB()->getParent(); } 
 
  /// Finds the positive number associated with \p V if it has been mapped. 
  /// \param [in] V - the Value to find. 
  /// \returns The positive number corresponding to the value. 
  /// \returns None if not present. 
  Optional<unsigned> getGVN(Value *V) { 
    assert(V != nullptr && "Value is a nullptr?"); 
    DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V); 
    if (VNIt == ValueToNumber.end()) 
      return None; 
    return VNIt->second; 
  } 
 
  /// Finds the Value associate with \p Num if it exists. 
  /// \param [in] Num - the number to find. 
  /// \returns The Value associated with the number. 
  /// \returns None if not present. 
  Optional<Value *> fromGVN(unsigned Num) { 
    DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num); 
    if (VNIt == NumberToValue.end()) 
      return None; 
    assert(VNIt->second != nullptr && "Found value is a nullptr!"); 
    return VNIt->second; 
  } 
 
  /// \param RHS -The IRSimilarityCandidate to compare against 
  /// \returns true if the IRSimilarityCandidate is occurs after the 
  /// IRSimilarityCandidate in the program. 
  bool operator<(const IRSimilarityCandidate &RHS) const { 
    return getStartIdx() > RHS.getStartIdx(); 
  } 
 
  using iterator = IRInstructionDataList::iterator; 
  iterator begin() const { return iterator(front()); } 
  iterator end() const { return std::next(iterator(back())); } 
}; 
 
typedef std::vector<IRSimilarityCandidate> SimilarityGroup; 
typedef std::vector<SimilarityGroup> SimilarityGroupList; 
 
/// This class puts all the pieces of the IRInstructionData, 
/// IRInstructionMapper, IRSimilarityCandidate together. 
/// 
/// It first feeds the Module or vector of Modules into the IRInstructionMapper, 
/// and puts all the mapped instructions into a single long list of 
/// IRInstructionData. 
/// 
/// The list of unsigned integers is given to the Suffix Tree or similar data 
/// structure to find repeated subsequences.  We construct an 
/// IRSimilarityCandidate for each instance of the subsequence.  We compare them 
/// against one another since  These repeated subsequences can have different 
/// structure.  For each different kind of structure found, we create a 
/// similarity group. 
/// 
/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are 
/// structurally similar to one another, while C is different we would have two 
/// SimilarityGroups: 
/// 
/// SimilarityGroup 1:  SimilarityGroup 2 
/// A, B, D             C 
/// 
/// A list of the different similarity groups is then returned after 
/// analyzing the module. 
class IRSimilarityIdentifier { 
public: 
  IRSimilarityIdentifier() 
      : Mapper(&InstDataAllocator, &InstDataListAllocator) {} 
 
  /// \param M the module to find similarity in. 
  explicit IRSimilarityIdentifier(Module &M) 
      : Mapper(&InstDataAllocator, &InstDataListAllocator) { 
    findSimilarity(M); 
  } 
 
private: 
  /// Map the instructions in the module to unsigned integers, using mapping 
  /// already present in the Mapper if possible. 
  /// 
  /// \param [in] M Module - To map to integers. 
  /// \param [in,out] InstrList - The vector to append IRInstructionData to. 
  /// \param [in,out] IntegerMapping - The vector to append integers to. 
  void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList, 
                      std::vector<unsigned> &IntegerMapping); 
 
  /// Map the instructions in the modules vector to unsigned integers, using 
  /// mapping already present in the mapper if possible. 
  /// 
  /// \param [in] Modules - The list of modules to use to populate the mapper 
  /// \param [in,out] InstrList - The vector to append IRInstructionData to. 
  /// \param [in,out] IntegerMapping - The vector to append integers to. 
  void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules, 
                      std::vector<IRInstructionData *> &InstrList, 
                      std::vector<unsigned> &IntegerMapping); 
 
  /// Find the similarity candidates in \p InstrList and corresponding 
  /// \p UnsignedVec 
  /// 
  /// \param [in,out] InstrList - The vector to append IRInstructionData to. 
  /// \param [in,out] IntegerMapping - The vector to append integers to. 
  /// candidates found in the program. 
  void findCandidates(std::vector<IRInstructionData *> &InstrList, 
                      std::vector<unsigned> &IntegerMapping); 
 
public: 
  // Find the IRSimilarityCandidates in the \p Modules and group by structural 
  // similarity in a SimilarityGroup, each group is returned in a 
  // SimilarityGroupList. 
  // 
  // \param [in] Modules - the modules to analyze. 
  // \returns The groups of similarity ranges found in the modules. 
  SimilarityGroupList & 
  findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules); 
 
  // Find the IRSimilarityCandidates in the given Module grouped by structural 
  // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. 
  // 
  // \param [in] M - the module to analyze. 
  // \returns The groups of similarity ranges found in the module. 
  SimilarityGroupList &findSimilarity(Module &M); 
 
  // Clears \ref SimilarityCandidates if it is already filled by a previous run. 
  void resetSimilarityCandidates() { 
    // If we've already analyzed a Module or set of Modules, so we must clear 
    // the SimilarityCandidates to make sure we do not have only old values 
    // hanging around. 
    if (SimilarityCandidates.hasValue()) 
      SimilarityCandidates->clear(); 
    else 
      SimilarityCandidates = SimilarityGroupList(); 
  } 
 
  // \returns The groups of similarity ranges found in the most recently passed 
  // set of modules. 
  Optional<SimilarityGroupList> &getSimilarity() { 
    return SimilarityCandidates; 
  } 
 
private: 
  /// The allocator for IRInstructionData. 
  SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator; 
 
  /// The allocator for IRInstructionDataLists. 
  SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator; 
 
  /// Map Instructions to unsigned integers and wraps the Instruction in an 
  /// instance of IRInstructionData. 
  IRInstructionMapper Mapper; 
 
  /// The SimilarityGroups found with the most recent run of \ref 
  /// findSimilarity. None if there is no recent run. 
  Optional<SimilarityGroupList> SimilarityCandidates; 
}; 
 
} // end namespace IRSimilarity 
 
/// An analysis pass based on legacy pass manager that runs and returns 
/// IRSimilarityIdentifier run on the Module. 
class IRSimilarityIdentifierWrapperPass : public ModulePass { 
  std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI; 
 
public: 
  static char ID; 
  IRSimilarityIdentifierWrapperPass(); 
 
  IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } 
  const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } 
 
  bool doInitialization(Module &M) override; 
  bool doFinalization(Module &M) override; 
  bool runOnModule(Module &M) override; 
  void getAnalysisUsage(AnalysisUsage &AU) const override { 
    AU.setPreservesAll(); 
  } 
}; 
 
/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the 
/// Module. 
class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { 
public: 
  typedef IRSimilarity::IRSimilarityIdentifier Result; 
 
  Result run(Module &M, ModuleAnalysisManager &); 
 
private: 
  friend AnalysisInfoMixin<IRSimilarityAnalysis>; 
  static AnalysisKey Key; 
}; 
 
/// Printer pass that uses \c IRSimilarityAnalysis. 
class IRSimilarityAnalysisPrinterPass 
    : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { 
  raw_ostream &OS; 
 
public: 
  explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} 
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); 
}; 
 
} // end namespace llvm 
 
#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 
 
#ifdef __GNUC__ 
#pragma GCC diagnostic pop 
#endif